"""Scraper for weekly .bps patches from romhackraces.com.

Walks every season's levels page, extracts week numbers from the site's
number-image markup, downloads each week's patch into patches/Season<N>/,
and records progress in download_history.json so repeat runs only fetch
patches that are new since the last run.
"""

import requests
from bs4 import BeautifulSoup
import os
import re
import json
import time
from urllib.parse import urljoin
from datetime import datetime


class RomhackRaceScraper:
    def __init__(self):
        self.base_url = "https://www.romhackraces.com/levels.php"
        self.session = requests.Session()
        self.rate_limit = 1  # minimum seconds between requests
        self.last_request = 0
        self.download_history_file = "download_history.json"
        self.download_history = self.load_download_history()
        self.debug = True
        os.makedirs('patches', exist_ok=True)

    def debug_print(self, message):
        if self.debug:
            print(f"DEBUG: {message}")

    def load_download_history(self):
        """Load the download history from disk, or return a fresh record."""
        try:
            with open(self.download_history_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {
                "last_update": "",
                "downloaded_patches": {},
                "last_season_checked": 0
            }

    def save_download_history(self):
        with open(self.download_history_file, 'w') as f:
            json.dump(self.download_history, f, indent=2)

    def rate_limited_request(self, url):
        """GET a URL, sleeping as needed to respect the rate limit."""
        self.debug_print(f"Making request to: {url}")
        current_time = time.time()
        time_since_last = current_time - self.last_request
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        response = self.session.get(url)
        self.last_request = time.time()
        self.debug_print(f"Response status code: {response.status_code}")
        return response

    def get_week_number(self, element):
        """Extract week number from span element containing number images"""
        # First find the span with font-size:18px that contains 'Week'
        week_span = element.find('span', style='font-size:18px;')
        if not week_span or 'Week' not in week_span.text:
            return None
        # Get all number images in this span
        number_images = week_span.find_all('img')
        if not number_images:
            self.debug_print("Found week span but no number images")
            return None
        try:
            # Extract numbers from image filenames
            numbers = [img['src'].split('/')[-1].split('.')[0]
                       for img in number_images]
            week_num = int(''.join(numbers))
            self.debug_print(f"Found week number: {week_num}")
            return week_num
        except (ValueError, KeyError, IndexError) as e:
            self.debug_print(f"Error parsing week number: {e}")
            return None

    def download_patch(self, url, week_number, season_number):
        """Download one patch, skipping it if history shows it was already fetched."""
        patch_id = f"s{season_number}_w{week_number}"
        if patch_id in self.download_history["downloaded_patches"]:
            self.debug_print(f"Patch {patch_id} already downloaded")
            return False
        response = self.rate_limited_request(url)
        if response.status_code == 200:
            season_dir = os.path.join('patches', f"Season{season_number}")
            os.makedirs(season_dir, exist_ok=True)
            filename = f"Week{week_number}.bps"
            filepath = os.path.join(season_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            self.download_history["downloaded_patches"][patch_id] = {
                "filename": filepath,
                "downloaded_at": datetime.now().isoformat(),
                "url": url
            }
            self.save_download_history()
            print(f"Downloaded Season {season_number} Week {week_number}")
            return True
        else:
            print(f"Failed to download Season {season_number} "
                  f"Week {week_number} - Status code: {response.status_code}")
            return False

    def get_seasons(self):
        """Collect season numbers from the season navigation links."""
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        season_div = soup.find('div', class_='info leaders',
                               style=lambda s: s and '300px' in s)
        if not season_div:
            self.debug_print("Could not find season navigation div")
            return []
        season_links = season_div.find_all('a')
        seasons = []
        for link in season_links:
            season_num = re.search(r'season=(\d+)', link['href'])
            if season_num:
                seasons.append(int(season_num.group(1)))
        # Season 1 is the default page and may have no nav link of its own
        if 1 not in seasons:
            seasons.append(1)
        self.debug_print(f"Found seasons: {seasons}")
        return sorted(seasons)

    def test_parse(self):
        """Test parsing on the first page"""
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find first patch link
        first_patch = soup.find('a', href=lambda href: href and href.endswith('.bps'))
        if first_patch:
            self.debug_print(f"Test found patch link: {first_patch['href']}")
        else:
            self.debug_print("Test could not find any patch links")
        # Find first week span
        first_week = soup.find('span', style='font-size:18px;')
        if first_week:
            self.debug_print(f"Test found week span: {first_week.text}")
            number_images = first_week.find_all('img')
            self.debug_print(f"Number images found: {len(number_images)}")
            for img in number_images:
                self.debug_print(f"Image source: {img['src']}")
        else:
            self.debug_print("Test could not find any week spans")

    def scrape_season(self, season_number):
        """Download every week's patch found on one season's page."""
        url = f"{self.base_url}?season={season_number}"
        print(f"\nScraping Season {season_number}")
        response = self.rate_limited_request(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        downloads_this_season = 0
        # Find all info divs
        info_divs = soup.find_all('div', class_='info')
        for info_div in info_divs:
            # Check if this div contains a week number
            week_num = self.get_week_number(info_div)
            if week_num is None:
                continue
            self.debug_print(f"Processing Week {week_num}")
            # Look for the patch link in the next table cell
            table_cell = info_div.find_next('td', valign='top', align='right')
            if table_cell:
                patch_link = table_cell.find(
                    'a', href=lambda href: href and href.endswith('.bps'))
                if patch_link:
                    self.debug_print(f"Found patch link: {patch_link['href']}")
                    patch_url = urljoin("https://www.romhackraces.com/",
                                        patch_link['href'])
                    self.debug_print(f"Full patch URL: {patch_url}")
                    if self.download_patch(patch_url, week_num, season_number):
                        downloads_this_season += 1
                else:
                    self.debug_print(f"No patch link found for Week {week_num}")
            else:
                self.debug_print(f"No table cell found for Week {week_num}")
        self.debug_print(f"Downloads this season: {downloads_this_season}")
        return downloads_this_season

    def scrape_all_seasons(self):
        """Scrape every season, skipping those already checked on earlier runs."""
        self.test_parse()
        seasons = self.get_seasons()
        print(f"Found {len(seasons)} seasons to scrape")
        total_downloads = 0
        last_season_checked = self.download_history["last_season_checked"]
        for season in seasons:
            # Re-check the most recent season in case new weeks were posted
            if season < last_season_checked:
                self.debug_print(f"Skipping Season {season} - already checked")
                continue
            downloads = self.scrape_season(season)
            total_downloads += downloads
            self.download_history["last_season_checked"] = max(
                season, self.download_history["last_season_checked"]
            )
        self.download_history["last_update"] = datetime.now().isoformat()
        self.save_download_history()
        print(f"\nDownload session complete. "
              f"Downloaded {total_downloads} new patches.")


def main():
    scraper = RomhackRaceScraper()
    # Check if we have existing downloads
    if os.path.exists("download_history.json"):
        print("Found existing download history")
        print(f"Last update: {scraper.download_history['last_update']}")
        print(f"Previously downloaded patches: "
              f"{len(scraper.download_history['downloaded_patches'])}")
        print("Checking for new patches...\n")
    else:
        print("No download history found. Will download all patches.\n")
    scraper.scrape_all_seasons()


if __name__ == '__main__':
    main()