crawl_heroes.py
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin


def crawl_paginated_landing_page(base_url):
    current_url = base_url
    all_cards = []
    while current_url:
        print(f"Fetching URL: {current_url}")

        # Fetch the page
        response = requests.get(current_url)

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to fetch the page. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        # Print the first 500 characters of the HTML to check what we're getting
        print("First 500 characters of HTML:")
        print(soup.prettify()[:500])

        # Find all elements with class="m-card-container"
        cards = soup.find_all(class_="m-card-container")
        print(f"Number of cards found: {len(cards)}")

        if not cards:
            # If no cards were found, fall back to a more general search
            all_divs = soup.find_all('div')
            print(f"Total number of div elements: {len(all_divs)}")
            for div in all_divs[:10]:  # Print the classes of the first 10 divs
                print(f"Div classes: {div.get('class', 'No class')}")

        # ... (rest of the card processing code remains the same)
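        # NOTE: the elided block above is where each card is turned into a
        # record. As a hypothetical sketch only (the field names "text" and
        # "href" are assumptions, not taken from the AWS Heroes markup), each
        # card could be reduced to its visible text and first link so that
        # all_cards is actually populated:
        for card in cards:
            link = card.find('a')
            all_cards.append({
                "text": card.get_text(strip=True),
                "href": link.get('href') if link else None,
            })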
        # Find the pagination element with aria-label="Next Page"
        next_link = soup.find(attrs={"aria-label": "Next Page"})
        if next_link and 'href' in next_link.attrs:
            # urljoin resolves relative hrefs against the current URL;
            # naive string concatenation breaks on absolute-path links
            current_url = urljoin(current_url, next_link['href'])
        else:
            current_url = None

    # Convert the collected card data to JSON and print it
    print(json.dumps(all_cards, indent=2))


# Usage
base_url = "https://aws.amazon.com/developer/community/heroes/"
crawl_paginated_landing_page(base_url)