-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathselenuim_scrape-old.py
163 lines (130 loc) · 6.14 KB
/
selenuim_scrape-old.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from selenium_scraper import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time, datetime
import json
import requests
import os
# Configure Chrome options.
chrome_options = Options()
# NOTE: "--headless" is deliberately not in this list, so a visible browser
# window is opened; add it here to run without a display.
for _chrome_flag in (
    "--disable-gpu",  # Disable GPU acceleration
    "--disable-extensions",  # Disable extensions
    "--disable-blink-features=AutomationControlled",  # Disable automation detection
    "--incognito",  # Open in incognito mode
    "--disable-popup-blocking",  # Disable popup blocking
    "--disable-dev-shm-usage",  # Overcome limited resource problems
    "--no-sandbox",  # Bypass OS security model
):
    chrome_options.add_argument(_chrome_flag)

# Path to the ChromeDriver binary, relative to the working directory.
chrome_driver_path = "./chromedriver-linux64/chromedriver"

# ChromeDriver service wrapping that binary.
service = ChromeService(executable_path=chrome_driver_path)

# Start the browser session shared by the rest of the script.
driver = webdriver.Chrome(service=service, options=chrome_options)
def simulate_human_interaction(driver):
    """Mimic a reader paging through the article.

    Sends the PAGE_DOWN key twice through one ``ActionChains`` object,
    sleeping two seconds after each key press.

    Args:
        driver: The WebDriver instance whose current page receives the keys.
    """
    scroller = ActionChains(driver)
    for _ in range(2):
        scroller.send_keys(Keys.PAGE_DOWN).perform()
        time.sleep(2)
# URL of the single story page that this script scrapes.
# The commented-out alternative below points at the "rabibashoriyo" path
# itself (presumably the section landing page -- TODO confirm) and is kept
# for reference only.
# url = "https://www.anandabazar.com//rabibashoriyo/"
url = "https://www.anandabazar.com/rabibashoriyo/bengali-short-story-authored-by-santanu-dey/cid/1529147"
def write_metadata(object):
    """Write a JSON metadata record for one scraped story.

    The record contains the story URL, its author and the crawl timestamp
    (local time formatted as ``YYYY-MM-DD HH:MM:SS``). It is saved as
    ``./metadata/rabibasariya/<name>-<author>.json``, with every space in
    the path replaced by a hyphen. The file is UTF-8 encoded and non-ASCII
    characters are written unescaped.

    Args:
        object: Mapping with at least the keys ``"url"``, ``"author"`` and
            ``"name"``. The parameter name shadows the builtin ``object``;
            it is kept unchanged so existing callers keep working.

    Raises:
        KeyError: If one of the required keys is missing.
        OSError: If the directory or file cannot be created or written.
    """
    crawl_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    metadata = {
        'url': object["url"],
        'author': object["author"],
        'crawl_date': crawl_time,
    }

    output_dir = './metadata/rabibasariya'
    output_file_path = f'{output_dir}/{object["name"]}-{object["author"]}.json'
    output_file_path = output_file_path.replace(' ', '-')

    # The directory does not exist on a fresh checkout; create it instead of
    # failing with FileNotFoundError on the first story.
    os.makedirs(output_dir, exist_ok=True)

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(metadata, outfile, ensure_ascii=False)
def write_story(object):
    """Download a story's lead image and save the story as Markdown.

    Steps, in order:

    1. Fetch ``object["image"]`` over HTTP and, on status 200, save it as
       ``metadata/images/rabibasariya/<name>-<author>.jpg``.
    2. Build a Markdown/HTML document holding the image, the title, the
       author and the story text.
    3. If ``./stories/rabibasariya/<name>-<author>.md`` does not exist yet,
       write the document there and append a link line to the file named
       ``rabibasariya`` in the working directory.

    Spaces in all generated paths are replaced by hyphens. The image tag is
    emitted even when the download failed, as in the original behaviour.

    Args:
        object: Mapping with the keys ``"name"``, ``"author"``, ``"image"``
            (image URL) and ``"text"`` (story body). The parameter name
            shadows the builtin ``object``; it is kept unchanged so existing
            callers keep working.

    Raises:
        KeyError: If a required key is missing.
        requests.RequestException: If the image request fails or times out.
        OSError: If an output directory or file cannot be written.
    """
    print(f'{object["name"]}: Writing story')

    # download image
    image_dir = "metadata/images/rabibasariya"
    image_outfile = f"{image_dir}/{object['name']}-{object['author']}.jpg"
    image_outfile = image_outfile.replace(' ', '-')

    # A timeout keeps a stalled image server from hanging the scraper forever.
    response = requests.get(object["image"], timeout=30)
    if response.status_code == 200:
        os.makedirs(image_dir, exist_ok=True)
        # Open a file in binary mode and write the response content to it
        with open(image_outfile, 'wb') as f:
            f.write(response.content)
        print(f'{object["name"]}: Image downloaded')
    else:
        print('Failed to download image')

    image_section = f'<div align=center> <img src="../../{image_outfile}" align="center"></div>'
    name_section = f'<h1 align=center>{object["name"]}</h1>'
    author_section = f'<h2 align=center>{object["author"]}</h2>'
    story_section = object["text"]

    markdown_content = f'{image_section}<br>{name_section}\n{author_section}<br>\n\n{story_section}'

    stories_dir = './stories/rabibasariya'
    markdown_outfile = f'{stories_dir}/{object["name"]}-{object["author"]}.md'
    markdown_outfile = markdown_outfile.replace(" ", '-')

    # if not already scraped
    if not os.path.exists(markdown_outfile):
        os.makedirs(stories_dir, exist_ok=True)
        # Explicit UTF-8: the story text is Bengali and the platform default
        # encoding is not guaranteed to be able to represent it.
        with open(markdown_outfile, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        # Append a link to the index file.
        # NOTE(review): the original comment calls this the README, but the
        # file opened is literally 'rabibasariya' -- confirm the intended path.
        with open('rabibasariya', "a", encoding='utf-8') as f:
            f.write(f"\n[ {object['name']} - {object['author']} ]({markdown_outfile})")

    print(f'{object["name"]}: Wrinting story')
def fetch_a_story(driver,url):
    """Scrape one story page and persist it through the writer helpers.

    Loads ``url``, waits up to 10 seconds for the ``<body>`` element, runs
    ``simulate_human_interaction``, then reads the lead image URL, the
    title, the author line and all paragraph texts from the page. The
    collected record is handed to ``write_metadata`` and ``write_story``.

    The driver is always quit on exit, whether scraping succeeded or not,
    so it cannot be reused after this call.

    Args:
        driver: A Selenium WebDriver instance.
        url: Address of the story page to scrape.
    """
    try:
        # Load the page and wait until its body is present in the DOM.
        driver.get(url)
        body_locator = (By.TAG_NAME, "body")
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(body_locator))

        simulate_human_interaction(driver)

        # Lead image: the high-priority <img> inside the lead image box.
        lead_box = driver.find_element(By.XPATH, "//div[@class='leadimgbox mt-24']")
        lead_img = lead_box.find_element(By.XPATH, ".//img[@fetchpriority='high']")
        image_url = lead_img.get_attribute('src')
        print(image_url)

        # Story title.
        article_box = driver.find_element(By.XPATH, "//div[@class='articletbox mt-32']")
        title_el = article_box.find_element(By.XPATH, ".//h1[@class='mt-8']")
        print(f"name: {title_el.text}")

        # Author line (taken from the "editbox" element).
        edit_box = driver.find_element(By.XPATH, "//div[@class='editbox df']")
        byline_el = edit_box.find_element(By.XPATH, ".//h5[@class='betellips betvl-1']")
        print(f"author: {byline_el.text}")

        # Story body: every <p> under the content box, each followed by a
        # blank line.
        story_box = driver.find_element(By.XPATH, "//div[@class='contentbox' and @id='contentbox']")
        paragraphs = story_box.find_elements(By.XPATH, ".//p")
        story_text = "".join(para.text + "\n\n" for para in paragraphs)
        print(type(story_text))

        # Assemble the record consumed by both writers.
        story = {
            "url": url.strip(),
            "image": image_url.strip(),
            "name": title_el.text.strip(),
            "author": byline_el.text.strip(),
            "text": story_text,
        }
        write_metadata(story)
        write_story(story)
    finally:
        # Quit the WebDriver
        driver.quit()
# Script entry point: scrape the single story at `url` using the module-level
# driver. fetch_a_story quits the driver in its `finally` clause.
fetch_a_story(driver,url)