You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Scraping on Facebook, when a group has more than 16 members, it crashes
CODE:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import time
import threading
import json
import colorama
from colorama import Fore, Style
import win32gui
import win32con
import msvcrt
from selenium.webdriver.common.action_chains import ActionChains
from datetime import datetime
chrome_options = Options()
chrome_options.add_argument("--headless=new") # Usar el nuevo modo headless
chrome_options.add_argument("--window-size=1920,1080") # Establecer resolución
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-software-rasterizer")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_experimental_option("detach", True)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.page_load_strategy = 'none'
driver_path = r"C:\chromedriver\chromedriver.exe"
service = Service(driver_path)
def is_cmd_window_active():
"""Verifica si la ventana de CMD está activa"""
try:
return win32gui.GetForegroundWindow() == win32gui.GetConsoleWindow()
except:
return True
def check_cmd_input():
"""Monitorea las teclas solo cuando CMD está activa"""
global extraction_active, is_paused, pause_start_time, total_pause_time
while extraction_active:
if is_cmd_window_active():
if msvcrt.kbhit():
key = msvcrt.getch().decode('utf-8').lower()
if key == '\r': # Enter
extraction_active = False
print(f"\n{Fore.YELLOW}Deteniendo extracción...{Style.RESET_ALL}")
break
elif key == ' ': # Espacio
is_paused = not is_paused
if is_paused:
pause_start_time = time.time()
estado = "⏸️ PAUSADO"
else:
if pause_start_time:
total_pause_time += time.time() - pause_start_time
estado = "▶️ CONTINUANDO"
print(f"\n{Fore.YELLOW}{estado}{Style.RESET_ALL}")
time.sleep(0.1)
def navegar_a_miembros(driver, url):
try:
driver.get(url)
time.sleep(3) # Esperar a que cargue la página inicial del grupo
# Buscar y hacer clic en el enlace de "people" o "members"
try:
people_link = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href*="/members/"]'))
)
people_link.click()
time.sleep(3) # Esperar a que cargue la página de miembros
return True
except TimeoutException:
print(f"{Fore.RED}No se pudo encontrar el enlace a los miembros{Style.RESET_ALL}")
return False
except Exception as e:
print(f"{Fore.RED}Error navegando a la página de miembros: {e}{Style.RESET_ALL}")
return False
def load_cookies(driver):
try:
if not os.path.exists(COOKIES_PATH):
print(f"{Fore.RED}Error: No se encontró el archivo de cookies en {COOKIES_PATH}{Style.RESET_ALL}")
return False
with open(COOKIES_PATH, 'r') as file:
cookies = json.load(file)
driver.get("https://www.facebook.com")
time.sleep(2)
driver.delete_all_cookies()
time.sleep(1)
for cookie in cookies:
if 'name' in cookie and 'value' in cookie:
cookie_dict = {
'name': cookie['name'],
'value': cookie['value'],
'domain': cookie.get('domain', '.facebook.com'),
'path': cookie.get('path', '/')
}
if 'expiry' in cookie:
cookie_dict['expiry'] = cookie['expiry']
if 'secure' in cookie:
cookie_dict['secure'] = cookie['secure']
try:
driver.add_cookie(cookie_dict)
except Exception as e:
print(f"{Fore.YELLOW}Warning al cargar cookie {cookie['name']}: {str(e)}{Style.RESET_ALL}")
driver.get("https://www.facebook.com")
time.sleep(3)
try:
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '[role="navigation"]'))
)
print(f"{Fore.GREEN}✓ Cookies cargadas exitosamente - Sesión activa{Style.RESET_ALL}")
return True
except TimeoutException:
print(f"{Fore.RED}Error: Las cookies se cargaron pero la sesión no está activa{Style.RESET_ALL}")
return False
except Exception as e:
print(f"{Fore.RED}Error cargando cookies: {str(e)}{Style.RESET_ALL}")
return False
def format_elapsed_time(seconds):
"""Formatea el tiempo transcurrido en formato HH:MM:SS"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
seconds = int(seconds % 60)
return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
def update_timer():
"""Función para actualizar y mostrar el timer en tiempo real"""
global timer_active, start_time, total_pause_time
while timer_active and start_time is not None:
if not is_paused:
elapsed_time = time.time() - start_time - total_pause_time
formatted_time = format_elapsed_time(elapsed_time)
print(f"\r{Fore.CYAN}⏱️ Tiempo transcurrido: {formatted_time}{Style.RESET_ALL}", end="")
time.sleep(1)
def get_elapsed_time():
"""Calcula el tiempo transcurrido teniendo en cuenta las pausas"""
if start_time is None:
return 0
pause_time = total_pause_time
if is_paused and pause_start_time:
pause_time += time.time() - pause_start_time
return time.time() - start_time - pause_time
def cargar_miembros(driver, group_name, url):
global extraction_active, is_paused, total_members, group_start_time, last_progress_time
ultima_altura = 0
scroll_count = 0
no_new_members_count = 0
previous_count = len(total_members)
group_start_time = time.time()
last_progress_time = time.time()
last_interval_members = set() # Conjunto para almacenar miembros del último intervalo
INITIAL_INTERVAL = 3600 # 1 hora en segundos
SUBSEQUENT_INTERVAL = 600 # 10 minutos en segundos
first_extraction_done = False
actions = ActionChains(driver)
while extraction_active:
try:
while is_paused:
time.sleep(0.1)
continue
while extraction_active and not is_paused:
current_time = time.time()
# Determinar el intervalo actual
current_interval = INITIAL_INTERVAL if not first_extraction_done else SUBSEQUENT_INTERVAL
# Realizar extracción y reporte según el intervalo
if current_time - last_progress_time >= current_interval:
# Extraer miembros actuales
elementos = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/groups/"][href*="/user/"]')
nuevos_perfiles = {elemento.get_attribute('href') for elemento in elementos if elemento.get_attribute('href')}
# Actualizar total_members con los nuevos perfiles
total_members.update(nuevos_perfiles)
# Calcular miembros nuevos en este intervalo
miembros_intervalo = nuevos_perfiles - last_interval_members
intervalo_texto = "primera hora" if not first_extraction_done else "últimos 10 minutos"
print(f"\n{Fore.CYAN}📊 Reporte de {intervalo_texto}:")
print(f"Miembros extraídos en {intervalo_texto}: {len(miembros_intervalo)}")
print(f"Total acumulado: {len(total_members)}")
print(f"Tiempo total transcurrido: {format_elapsed_time(get_elapsed_time())}{Style.RESET_ALL}")
# Actualizar para el siguiente intervalo
last_progress_time = current_time
last_interval_members = nuevos_perfiles.copy()
first_extraction_done = True
# Hacer una breve pausa para la extracción
is_paused = True
time.sleep(2)
is_paused = False
# Lógica normal de scroll
for _ in range(5):
if is_paused or not extraction_active:
break
actions.send_keys(Keys.PAGE_DOWN).perform()
if is_paused or not extraction_active:
break
time.sleep(0.01)
if not is_paused and extraction_active:
actions.send_keys(Keys.END).perform()
if not is_paused and extraction_active:
current_scroll = driver.execute_script('return window.pageYOffset')
viewport_height = driver.execute_script('return window.innerHeight')
driver.execute_script(f'window.scrollTo(0, {current_scroll + viewport_height * 8})')
if is_paused:
continue
# Verificar si hay nuevos miembros
elementos = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/groups/"][href*="/user/"]')
current_profiles = {elemento.get_attribute('href') for elemento in elementos if elemento.get_attribute('href')}
current_count = len(current_profiles)
if current_count <= previous_count:
no_new_members_count += 1
if no_new_members_count >= 5:
group_time = format_elapsed_time(time.time() - group_start_time)
print(f"\n{Fore.YELLOW}No se encontraron nuevos miembros después de 5 intentos."
f"\nTiempo de extracción para este grupo: {group_time}{Style.RESET_ALL}")
extraction_active = False
break
else:
no_new_members_count = 0
previous_count = current_count
total_members.update(current_profiles)
altura_actual = driver.execute_script('return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight)')
if altura_actual == ultima_altura:
actions.send_keys(Keys.HOME).perform()
time.sleep(0.5)
ultima_altura = altura_actual
scroll_count += 1
except WebDriverException as e:
if "disconnected" in str(e):
try:
driver.refresh()
time.sleep(2)
navegar_a_miembros(driver, url)
continue
except:
pass
continue
print(f"\n{Fore.YELLOW}Extracción finalizada. Total de perfiles encontrados: {len(total_members)}{Style.RESET_ALL}")
return total_members
def extraer_perfiles_rapido(driver):
global total_members
try:
elementos = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/groups/"][href*="/user/"]')
nuevos_perfiles = {elemento.get_attribute('href') for elemento in elementos if elemento.get_attribute('href')}
total_members.update(nuevos_perfiles)
return total_members
except Exception as e:
print(f"{Fore.RED}Error extrayendo perfiles: {e}{Style.RESET_ALL}")
return total_members
def extraer_perfiles_rapido(driver):
global total_members
try:
elementos = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/groups/"][href*="/user/"]')
nuevos_perfiles = {elemento.get_attribute('href') for elemento in elementos if elemento.get_attribute('href')}
total_members.update(nuevos_perfiles)
return total_members
except Exception as e:
print(f"{Fore.RED}Error extrayendo perfiles: {e}{Style.RESET_ALL}")
return total_members
# Verificar si el archivo existe y generar un nuevo nombre si es necesario
while os.path.exists(archivo_salida):
nombre_sin_extension = nombre_grupo
extension = ".txt"
archivo_salida = f"{nombre_sin_extension} ({contador}){extension}"
contador += 1
try:
with open(archivo_salida, 'w', encoding='utf-8') as archivo:
for perfil in perfiles:
archivo.write(perfil + "\n")
print(f"{Fore.GREEN}Perfiles guardados en: {os.path.abspath(archivo_salida)}{Style.RESET_ALL}")
print(f"{Fore.GREEN}Total de perfiles guardados: {len(perfiles)}{Style.RESET_ALL}")
except Exception as e:
print(f"{Fore.RED}Error guardando archivo: {e}{Style.RESET_ALL}")
def clear_console():
os.system('cls' if os.name == 'nt' else 'clear')
def get_urls_input():
print(f"{Fore.WHITE}Pegue la(s) URL(s) del grupo (una por línea):{Style.RESET_ALL}")
urls = []
try:
while True:
url = input().strip()
if url:
if url.startswith("http"):
urls.append(url)
break
elif not url and urls:
break
except KeyboardInterrupt:
return urls
return urls
def main():
global extraction_active, start_time, total_members, is_paused, timer_active, total_pause_time
clear_console()
print(f"{Fore.CYAN}{'='*50}")
print(f"{Fore.GREEN}Facebook Group Member Extractor{Style.RESET_ALL}")
print(f"{Fore.CYAN}{'='*50}{Style.RESET_ALL}")
start_time = time.time()
total_pause_time = 0
urls = get_urls_input()
if not urls:
print(f"{Fore.RED}No se ingresaron URLs válidas{Style.RESET_ALL}")
return
driver = webdriver.Chrome(service=service, options=chrome_options)
if not load_cookies(driver):
print(f"{Fore.RED}No se pudieron cargar las cookies. El programa se cerrará.{Style.RESET_ALL}")
driver.quit()
return
try:
driver.find_element(By.CSS_SELECTOR, '[role="main"]')
except:
print(f"{Fore.RED}Error: La sesión no está activa. Verifica las cookies.{Style.RESET_ALL}")
driver.quit()
return
# Iniciar el timer y el monitor de teclas en hilos separados
timer_thread = threading.Thread(target=update_timer)
timer_thread.daemon = True
timer_thread.start()
monitor_thread = threading.Thread(target=check_cmd_input)
monitor_thread.daemon = True
monitor_thread.start()
try:
for i, url in enumerate(urls, 1):
if not extraction_active:
break
print(f"\n{Fore.CYAN}Procesando grupo {i}/{len(urls)}: {url}{Style.RESET_ALL}")
total_members.clear()
if navegar_a_miembros(driver, url):
group_name = get_group_name(driver)
if not group_name:
group_name = url.split('/')[-2] if url[-1] == '/' else url.split('/')[-1]
miembros_extraidos = cargar_miembros(driver, group_name, url)
if miembros_extraidos:
guardar_perfiles(miembros_extraidos, group_name)
print(f"{Fore.GREEN}Extracción completada para el grupo: {group_name}{Style.RESET_ALL}")
else:
print(f"{Fore.RED}No se pudieron extraer miembros del grupo{Style.RESET_ALL}")
except Exception as e:
print(f"{Fore.RED}Error: {e}{Style.RESET_ALL}")
finally:
timer_active = False
total_time = format_elapsed_time(get_elapsed_time())
print(f"\n{Fore.GREEN}Tiempo total de ejecución: {total_time}{Style.RESET_ALL}")
if extraction_active:
print(f"\n{Fore.YELLOW}Presiona ENTER para cerrar el navegador y terminar el programa{Style.RESET_ALL}")
while extraction_active:
time.sleep(0.1)
driver.quit()
if name == "main":
try:
main()
except KeyboardInterrupt:
timer_active = False
total_time = format_elapsed_time(get_elapsed_time())
print(f"\n{Fore.YELLOW}Programa terminado por el usuario")
print(f"Tiempo total de ejecución: {total_time}{Style.RESET_ALL}")
except Exception as e:
timer_active = False
total_time = format_elapsed_time(get_elapsed_time())
print(f"\n{Fore.RED}Error inesperado: {e}")
print(f"Tiempo total de ejecución: {total_time}{Style.RESET_ALL}")
finally:
print(f"\n{Fore.GREEN}¡Proceso finalizado!{Style.RESET_ALL}")
How can we reproduce the issue?
.
Relevant log output
.
Operating System
widnows 10
Selenium version
the ultimate update
What are the browser(s) and version(s) where you see this issue?
chrome
What are the browser driver(s) and version(s) where you see this issue?
ultimate version
Are you using Selenium Grid?
No response
The text was updated successfully, but these errors were encountered:
@studiokevinabanto, thank you for creating this issue. We will troubleshoot it as soon as we can.
Info for maintainers
Triage this issue by using labels.
If information is missing, add a helpful comment and then I-issue-template label.
If the issue is a question, add the I-question label.
If the issue is valid but there is no time to troubleshoot it, consider adding the help wanted label.
If the issue requires changes or fixes from an external project (e.g., ChromeDriver, GeckoDriver, MSEdgeDriver, W3C),
add the applicable G-* label, and it will provide the correct link and auto-close the
issue.
After troubleshooting the issue, please add the R-awaiting answer label.
Hi, @studiokevinabanto.
Please follow the issue template, we need more information to reproduce the issue.
Either a complete code snippet and URL/HTML (if more than one file is needed, provide a GitHub repo and instructions to run the code), the specific versions used, or a more detailed description to help us understand the issue.
Note: If you cannot share your code and URL/HTML, any complete code snippet and URL/HTML that reproduces the issue is good enough.
Reply to this issue when all information is provided, thank you.
What happened?
Scraping on Facebook, when a group has more than 16 members, it crashes
CODE:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import time
import threading
import json
import colorama
from colorama import Fore, Style
import win32gui
import win32con
import msvcrt
from selenium.webdriver.common.action_chains import ActionChains
from datetime import datetime
Inicializar colorama
colorama.init()
Variables globales
start_time = None
total_members = set()
should_continue = True
extraction_active = True
is_paused = False
timer_active = True
group_start_time = None
pause_start_time = None
total_pause_time = 0
last_progress_time = None
PROGRESS_INTERVAL = 3600 # 1 hora en segundos
Ruta del archivo de cookies
COOKIES_PATH = r"C:\selenium\0_extraer_miembros_de_grupos\facebook_cookies.json"
Configuración de Chrome
chrome_options = Options()
chrome_options.add_argument("--headless=new") # Usar el nuevo modo headless
chrome_options.add_argument("--window-size=1920,1080") # Establecer resolución
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-software-rasterizer")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_experimental_option("detach", True)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.page_load_strategy = 'none'
driver_path = r"C:\chromedriver\chromedriver.exe"
service = Service(driver_path)
def is_cmd_window_active():
"""Verifica si la ventana de CMD está activa"""
try:
return win32gui.GetForegroundWindow() == win32gui.GetConsoleWindow()
except:
return True
def check_cmd_input():
"""Monitorea las teclas solo cuando CMD está activa"""
global extraction_active, is_paused, pause_start_time, total_pause_time
def navegar_a_miembros(driver, url):
try:
driver.get(url)
time.sleep(3) # Esperar a que cargue la página inicial del grupo
def load_cookies(driver):
try:
if not os.path.exists(COOKIES_PATH):
print(f"{Fore.RED}Error: No se encontró el archivo de cookies en {COOKIES_PATH}{Style.RESET_ALL}")
return False
def format_elapsed_time(seconds):
"""Formatea el tiempo transcurrido en formato HH:MM:SS"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
seconds = int(seconds % 60)
return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
def update_timer():
"""Función para actualizar y mostrar el timer en tiempo real"""
global timer_active, start_time, total_pause_time
def get_elapsed_time():
"""Calcula el tiempo transcurrido teniendo en cuenta las pausas"""
if start_time is None:
return 0
pause_time = total_pause_time
if is_paused and pause_start_time:
pause_time += time.time() - pause_start_time
return time.time() - start_time - pause_time
def scroll_page_extreme(driver):
try:
actions = ActionChains(driver)
def cargar_miembros(driver, group_name, url):
global extraction_active, is_paused, total_members, group_start_time, last_progress_time
ultima_altura = 0
scroll_count = 0
no_new_members_count = 0
previous_count = len(total_members)
group_start_time = time.time()
last_progress_time = time.time()
last_interval_members = set() # Conjunto para almacenar miembros del último intervalo
INITIAL_INTERVAL = 3600 # 1 hora en segundos
SUBSEQUENT_INTERVAL = 600 # 10 minutos en segundos
first_extraction_done = False
def extraer_perfiles_rapido(driver):
global total_members
try:
elementos = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/groups/"][href*="/user/"]')
nuevos_perfiles = {elemento.get_attribute('href') for elemento in elementos if elemento.get_attribute('href')}
total_members.update(nuevos_perfiles)
return total_members
except Exception as e:
print(f"{Fore.RED}Error extrayendo perfiles: {e}{Style.RESET_ALL}")
return total_members
def extraer_perfiles_rapido(driver):
global total_members
try:
elementos = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/groups/"][href*="/user/"]')
nuevos_perfiles = {elemento.get_attribute('href') for elemento in elementos if elemento.get_attribute('href')}
total_members.update(nuevos_perfiles)
return total_members
except Exception as e:
print(f"{Fore.RED}Error extrayendo perfiles: {e}{Style.RESET_ALL}")
return total_members
def guardar_perfiles(perfiles, nombre_grupo):
if perfiles:
base_nombre = f"{nombre_grupo}.txt"
archivo_salida = base_nombre
contador = 1
def clear_console():
os.system('cls' if os.name == 'nt' else 'clear')
def get_group_name(driver):
try:
return driver.title.split('|')[0].strip()
except:
return None
def get_urls_input():
print(f"{Fore.WHITE}Pegue la(s) URL(s) del grupo (una por línea):{Style.RESET_ALL}")
urls = []
try:
while True:
url = input().strip()
if url:
if url.startswith("http"):
urls.append(url)
break
elif not url and urls:
break
except KeyboardInterrupt:
return urls
def main():
global extraction_active, start_time, total_members, is_paused, timer_active, total_pause_time
clear_console()
print(f"{Fore.CYAN}{'='*50}")
print(f"{Fore.GREEN}Facebook Group Member Extractor{Style.RESET_ALL}")
print(f"{Fore.CYAN}{'='*50}{Style.RESET_ALL}")
if name == "main":
try:
main()
except KeyboardInterrupt:
timer_active = False
total_time = format_elapsed_time(get_elapsed_time())
print(f"\n{Fore.YELLOW}Programa terminado por el usuario")
print(f"Tiempo total de ejecución: {total_time}{Style.RESET_ALL}")
except Exception as e:
timer_active = False
total_time = format_elapsed_time(get_elapsed_time())
print(f"\n{Fore.RED}Error inesperado: {e}")
print(f"Tiempo total de ejecución: {total_time}{Style.RESET_ALL}")
finally:
print(f"\n{Fore.GREEN}¡Proceso finalizado!{Style.RESET_ALL}")
How can we reproduce the issue?
.
Relevant log output
.
Operating System
widnows 10
Selenium version
the ultimate update
What are the browser(s) and version(s) where you see this issue?
chrome
What are the browser driver(s) and version(s) where you see this issue?
ultimate version
Are you using Selenium Grid?
No response
The text was updated successfully, but these errors were encountered: