# rhrtools/downloader.py
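"""Scrape weekly .bps patches from romhackraces.com.

Walks each season listed on the levels page, downloads any patch not yet
recorded in download_history.json, and saves it under
patches/Season<N>/Week<M>.bps so repeat runs only fetch new weeks.
"""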

import json
import os
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class RomhackRaceScraper:
    def __init__(self):
        self.base_url = "https://www.romhackraces.com/levels.php"
        self.session = requests.Session()
        self.rate_limit = 1  # minimum seconds between requests
        self.last_request = 0  # timestamp of the most recent request
        self.download_history_file = "download_history.json"
        self.download_history = self.load_download_history()
        self.debug = True
        os.makedirs('patches', exist_ok=True)

    def debug_print(self, message):
        if self.debug:
            print(f"DEBUG: {message}")

    def load_download_history(self):
        try:
            with open(self.download_history_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {
                "last_update": "",
                "downloaded_patches": {},
                "last_season_checked": 0
            }

    def save_download_history(self):
        with open(self.download_history_file, 'w') as f:
            json.dump(self.download_history, f, indent=2)

    def rate_limited_request(self, url):
        self.debug_print(f"Making request to: {url}")
        current_time = time.time()
        time_since_last = current_time - self.last_request
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        response = self.session.get(url)
        self.last_request = time.time()
        self.debug_print(f"Response status code: {response.status_code}")
        return response
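
    # Note: with rate_limit = 1, consecutive requests are spaced at least one
    # second apart; e.g. a call made 0.3 s after the previous request sleeps
    # for the remaining 0.7 s before hitting the server.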

    def get_week_number(self, element):
        """Extract week number from span element containing number images"""
        # First find the span with font-size:18px that contains 'Week'
        week_span = element.find('span', style='font-size:18px;')
        if not week_span or 'Week' not in week_span.text:
            return None
        # Get all number images in this span
        number_images = week_span.find_all('img')
        if not number_images:
            self.debug_print("Found week span but no number images")
            return None
        try:
            # Extract the digits from the image filenames and join them
            numbers = [img['src'].split('/')[-1].split('.')[0] for img in number_images]
            week_num = int(''.join(numbers))
            self.debug_print(f"Found week number: {week_num}")
            return week_num
        except (ValueError, KeyError, IndexError) as e:
            self.debug_print(f"Error parsing week number: {e}")
            return None
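
    # Illustrative markup this parser assumes (hypothetical, inferred from the
    # selectors above): the week number is rendered as digit images, e.g.
    #   <span style="font-size:18px;">Week
    #     <img src="images/4.png"><img src="images/2.png"></span>
    # which get_week_number joins into 42 from the "4" and "2" filenames.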

    def download_patch(self, url, week_number, season_number):
        patch_id = f"s{season_number}_w{week_number}"
        if patch_id in self.download_history["downloaded_patches"]:
            self.debug_print(f"Patch {patch_id} already downloaded")
            return False
        response = self.rate_limited_request(url)
        if response.status_code == 200:
            season_dir = os.path.join('patches', f"Season{season_number}")
            os.makedirs(season_dir, exist_ok=True)
            filename = f"Week{week_number}.bps"
            filepath = os.path.join(season_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            self.download_history["downloaded_patches"][patch_id] = {
                "filename": filepath,
                "downloaded_at": datetime.now().isoformat(),
                "url": url
            }
            self.save_download_history()
            print(f"Downloaded Season {season_number} Week {week_number}")
            return True
        else:
            print(f"Failed to download Season {season_number} Week {week_number} - Status code: {response.status_code}")
            return False
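
    # Resulting layout for, say, season 3 week 42: patches/Season3/Week42.bps
    # on disk, plus an "s3_w42" entry in download_history.json recording the
    # file path, download timestamp, and source URL.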

    def get_seasons(self):
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        seasons = set()  # use a set to avoid duplicates
        # Find the season navigation section - looking for text that starts with "Season"
        season_text = soup.find(string=lambda t: t and t.strip().startswith('Season'))
        if season_text:
            parent = season_text.parent
            # Get all links and the bold span that follow "Season"
            elements = parent.find_all(['a', 'span'])
            for element in elements:
                if element.name == 'a':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue
                elif element.name == 'span' and element.get('style') == 'font-weight:bold;':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue
        if not seasons:
            self.debug_print("Warning: No seasons found in normal parsing")
            # Fallback: pull season numbers from any 'season=' links
            season_links = soup.find_all('a', href=lambda h: h and 'season=' in h)
            for link in season_links:
                match = re.search(r'season=(\d+)', link['href'])
                if match:
                    seasons.add(int(match.group(1)))
        # Always include season 1
        seasons.add(1)
        sorted_seasons = sorted(seasons)
        self.debug_print(f"Found seasons: {sorted_seasons}")
        if len(sorted_seasons) < 2:
            self.debug_print("Warning: Found unusually few seasons, might indicate parsing error")
        return sorted_seasons
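
    # Assumed season navigation (hypothetical, based on the selectors above):
    #   Season <a href="levels.php?season=1">1</a> ...
    #          <span style="font-weight:bold;">5</span>
    # i.e. past seasons are plain links and the currently viewed season is a
    # bold span, which is why both element types are checked.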

    def test_parse(self):
        """Test parsing on the first page"""
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find first patch link
        first_patch = soup.find('a', href=lambda href: href and href.endswith('.bps'))
        if first_patch:
            self.debug_print(f"Test found patch link: {first_patch['href']}")
        else:
            self.debug_print("Test could not find any patch links")
        # Find first week span
        first_week = soup.find('span', style='font-size:18px;')
        if first_week:
            self.debug_print(f"Test found week span: {first_week.text}")
            number_images = first_week.find_all('img')
            self.debug_print(f"Number images found: {len(number_images)}")
            for img in number_images:
                self.debug_print(f"Image source: {img['src']}")
        else:
            self.debug_print("Test could not find any week spans")

    def scrape_season(self, season_number):
        url = f"{self.base_url}?season={season_number}"
        print(f"\nScraping Season {season_number}")
        response = self.rate_limited_request(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        downloads_this_season = 0
        # Find all info divs
        info_divs = soup.find_all('div', class_='info')
        for info_div in info_divs:
            # Check if this div contains a week number
            week_num = self.get_week_number(info_div)
            if week_num is None:
                continue
            self.debug_print(f"Processing Week {week_num}")
            # Look for the patch link in the next table cell
            table_cell = info_div.find_next('td', valign='top', align='right')
            if table_cell:
                patch_link = table_cell.find('a', href=lambda href: href and href.endswith('.bps'))
                if patch_link:
                    self.debug_print(f"Found patch link: {patch_link['href']}")
                    patch_url = urljoin("https://www.romhackraces.com/", patch_link['href'])
                    self.debug_print(f"Full patch URL: {patch_url}")
                    if self.download_patch(patch_url, week_num, season_number):
                        downloads_this_season += 1
                else:
                    self.debug_print(f"No patch link found for Week {week_num}")
            else:
                self.debug_print(f"No table cell found for Week {week_num}")
        self.debug_print(f"Downloads this season: {downloads_this_season}")
        return downloads_this_season
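
    # Layout assumption in scrape_season: each week's "info" div is followed
    # in document order by a right-aligned <td> containing the .bps link, so
    # find_next() is used instead of navigating a shared parent element.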

    def scrape_all_seasons(self):
        self.test_parse()
        seasons = self.get_seasons()
        print(f"Found {len(seasons)} seasons to scrape")
        total_downloads = 0
        last_season_checked = self.download_history["last_season_checked"]
        for season in seasons:
            # Strict "<" so the most recently checked season is re-scanned
            # for weeks posted since the last run
            if season < last_season_checked:
                self.debug_print(f"Skipping Season {season} - already checked")
                continue
            downloads = self.scrape_season(season)
            total_downloads += downloads
            self.download_history["last_season_checked"] = max(
                season,
                self.download_history["last_season_checked"]
            )
        self.download_history["last_update"] = datetime.now().isoformat()
        self.save_download_history()
        print(f"\nDownload session complete. Downloaded {total_downloads} new patches.")

def main():
    scraper = RomhackRaceScraper()
    # Check if we have existing downloads
    if os.path.exists("download_history.json"):
        print("Found existing download history")
        print(f"Last update: {scraper.download_history['last_update']}")
        print(f"Previously downloaded patches: {len(scraper.download_history['downloaded_patches'])}")
        print("Checking for new patches...\n")
    else:
        print("No download history found. Will download all patches.\n")
    scraper.scrape_all_seasons()


if __name__ == '__main__':
    main()
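
# Typical behavior (assuming the site layout sketched above): the first run
# downloads every season's patches; later runs resume from last_season_checked
# and fetch only weeks missing from download_history.json.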