`downloader.py` can now be called with `-d` or `--debug` to output extra debugging information; the flag currently defaults to false. Closes issue #4.
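Example invocation (assuming the script is run directly with Python): `python downloader.py --debug` prints the per-request and parsing diagnostics emitted through `debug_print`; without the flag, output is limited to the normal download progress messages.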
262 lines
10 KiB
Python
import requests
from bs4 import BeautifulSoup
import os
import re
import json
import time
from urllib.parse import urljoin
from datetime import datetime
import argparse


class RomhackRaceScraper:
    def __init__(self):
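        """Set up the HTTP session, rate limiting, download history, and output directory."""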
        self.base_url = "https://www.romhackraces.com/levels.php"
        self.session = requests.Session()
        self.rate_limit = 1  # minimum seconds between requests
        self.last_request = 0
        self.download_history_file = "download_history.json"
        self.download_history = self.load_download_history()
        self.debug = False  # Default to False; enabled via the -d/--debug flag in main()

        os.makedirs('patches', exist_ok=True)

    def debug_print(self, message):
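        """Print a message only when debug output is enabled."""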
        if self.debug:
            print(f"DEBUG: {message}")

    def load_download_history(self):
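        """Load the download history JSON file, or return an empty history if it does not exist."""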
        try:
            with open(self.download_history_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {
                "last_update": "",
                "downloaded_patches": {},
                "last_season_checked": 0
            }

    def save_download_history(self):
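        """Write the download history back to its JSON file."""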
        with open(self.download_history_file, 'w') as f:
            json.dump(self.download_history, f, indent=2)

    def rate_limited_request(self, url):
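        """GET a URL through the shared session, sleeping so requests are at least rate_limit seconds apart."""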
        self.debug_print(f"Making request to: {url}")
        current_time = time.time()
        time_since_last = current_time - self.last_request
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)

        response = self.session.get(url)
        self.last_request = time.time()
        self.debug_print(f"Response status code: {response.status_code}")
        return response

    def get_week_number(self, element):
        """Extract week number from span element containing number images"""
        # First find the span with font-size:18px that contains 'Week'
        week_span = element.find('span', style='font-size:18px;')
        if not week_span or 'Week' not in week_span.text:
            return None

        # Get all number images in this span
        number_images = week_span.find_all('img')
        if not number_images:
            self.debug_print("Found week span but no number images")
            return None

        try:
            # Extract numbers from image filenames
            numbers = [img['src'].split('/')[-1].split('.')[0] for img in number_images]
            week_num = int(''.join(numbers))
            self.debug_print(f"Found week number: {week_num}")
            return week_num
        except (ValueError, KeyError, IndexError) as e:
            self.debug_print(f"Error parsing week number: {e}")
            return None

    def download_patch(self, url, week_number, season_number):
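        """Download a single .bps patch into patches/Season<N>/, unless it is already in the history.

        Returns True if a new file was written, False otherwise.
        """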
        patch_id = f"s{season_number}_w{week_number}"

        if patch_id in self.download_history["downloaded_patches"]:
            self.debug_print(f"Patch {patch_id} already downloaded")
            return False

        response = self.rate_limited_request(url)
        if response.status_code == 200:
            season_dir = os.path.join('patches', f"Season{season_number}")
            os.makedirs(season_dir, exist_ok=True)

            filename = f"Week{week_number}.bps"
            filepath = os.path.join(season_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(response.content)

            self.download_history["downloaded_patches"][patch_id] = {
                "filename": filepath,
                "downloaded_at": datetime.now().isoformat(),
                "url": url
            }
            self.save_download_history()

            print(f"Downloaded Season {season_number} Week {week_number}")
            return True
        else:
            print(f"Failed to download Season {season_number} Week {week_number} - Status code: {response.status_code}")
            return False

    def get_seasons(self):
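        """Parse the season navigation on the levels page and return a sorted list of season numbers."""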
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        seasons = set()  # use a set to avoid duplicates

        # Find the season navigation section - looking for text that starts with "Season"
        season_text = soup.find(string=lambda t: t and t.strip().startswith('Season'))
        if season_text:
            parent = season_text.parent

            # Get all links and the bold span that follow "Season"
            elements = parent.find_all(['a', 'span'])
            for element in elements:
                if element.name == 'a':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue
                elif element.name == 'span' and element.get('style') == 'font-weight:bold;':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue

        if not seasons:
            self.debug_print("Warning: No seasons found in normal parsing")
            # Fallback: try to find any numbers in season links
            season_links = soup.find_all('a', href=lambda h: h and 'season=' in h)
            for link in season_links:
                match = re.search(r'season=(\d+)', link['href'])
                if match:
                    seasons.add(int(match.group(1)))

        # Always include season 1
        seasons.add(1)

        sorted_seasons = sorted(list(seasons))
        self.debug_print(f"Found seasons: {sorted_seasons}")

        if len(sorted_seasons) < 2:
            self.debug_print("Warning: Found unusually few seasons, might indicate parsing error")

        return sorted_seasons

    def test_parse(self):
        """Test parsing on the first page"""
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find first patch link
        first_patch = soup.find('a', href=lambda href: href and href.endswith('.bps'))
        if first_patch:
            self.debug_print(f"Test found patch link: {first_patch['href']}")
        else:
            self.debug_print("Test could not find any patch links")

        # Find first week span
        first_week = soup.find('span', style='font-size:18px;')
        if first_week:
            self.debug_print(f"Test found week span: {first_week.text}")
            number_images = first_week.find_all('img')
            self.debug_print(f"Number images found: {len(number_images)}")
            for img in number_images:
                self.debug_print(f"Image source: {img['src']}")
        else:
            self.debug_print("Test could not find any week spans")

    def scrape_season(self, season_number):
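        """Scrape one season's page, downloading any new patches, and return the number of downloads."""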
        url = f"{self.base_url}?season={season_number}"
        print(f"\nScraping Season {season_number}")

        response = self.rate_limited_request(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        downloads_this_season = 0

        # Find all info divs
        info_divs = soup.find_all('div', class_='info')

        for info_div in info_divs:
            # Check if this div contains a week number
            week_num = self.get_week_number(info_div)
            if week_num is None:
                continue

            self.debug_print(f"Processing Week {week_num}")

            # Look for the patch link in the next table cell
            table_cell = info_div.find_next('td', valign='top', align='right')
            if table_cell:
                patch_link = table_cell.find('a', href=lambda href: href and href.endswith('.bps'))
                if patch_link:
                    self.debug_print(f"Found patch link: {patch_link['href']}")
                    patch_url = urljoin("https://www.romhackraces.com/", patch_link['href'])
                    self.debug_print(f"Full patch URL: {patch_url}")

                    if self.download_patch(patch_url, week_num, season_number):
                        downloads_this_season += 1
                else:
                    self.debug_print(f"No patch link found for Week {week_num}")
            else:
                self.debug_print(f"No table cell found for Week {week_num}")

        self.debug_print(f"Downloads this season: {downloads_this_season}")
        return downloads_this_season

    def scrape_all_seasons(self):
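        """Run the test parse, then scrape every season from the last checked season onward, saving history after each."""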
        self.test_parse()
        seasons = self.get_seasons()
        print(f"Found {len(seasons)} seasons to scrape")

        total_downloads = 0
        last_season_checked = self.download_history["last_season_checked"]

        for season in seasons:
            if season < last_season_checked:
                self.debug_print(f"Skipping Season {season} - already checked")
                continue

            downloads = self.scrape_season(season)
            total_downloads += downloads

            self.download_history["last_season_checked"] = max(
                season,
                self.download_history["last_season_checked"]
            )
            self.download_history["last_update"] = datetime.now().isoformat()
            self.save_download_history()

        print(f"\nDownload session complete. Downloaded {total_downloads} new patches.")


def main():
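    """Parse command-line arguments, report any existing download history, and run the scraper."""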
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Romhack Race Scraper')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug output')
    args = parser.parse_args()

    scraper = RomhackRaceScraper()
    scraper.debug = args.debug  # Set debug flag based on command line argument

    # Check if we have existing downloads
    if os.path.exists("download_history.json"):
        print("Found existing download history")
        print(f"Last update: {scraper.download_history['last_update']}")
        print(f"Previously downloaded patches: {len(scraper.download_history['downloaded_patches'])}")
        print("Checking for new patches...\n")
    else:
        print("No download history found. Will download all patches.\n")

    scraper.scrape_all_seasons()


if __name__ == '__main__':
    main()