rhrtools/downloader.py
agatha 1c520a7f58 feat: add debug flag to downloader
`downloader.py` can now be called with `-d` or `--debug` to output extra information.
It currently defaults to false.
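For example: `python downloader.py --debug`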

closes issue #4
2025-08-03 12:23:15 -04:00

262 lines
10 KiB
Python

import argparse
import json
import os
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class RomhackRaceScraper:
    def __init__(self):
        self.base_url = "https://www.romhackraces.com/levels.php"
        self.session = requests.Session()
        self.rate_limit = 1  # minimum seconds between requests
        self.last_request = 0
        self.download_history_file = "download_history.json"
        self.download_history = self.load_download_history()
        self.debug = False  # Default to False
        os.makedirs('patches', exist_ok=True)

    def debug_print(self, message):
        if self.debug:
            print(f"DEBUG: {message}")

    def load_download_history(self):
        try:
            with open(self.download_history_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {
                "last_update": "",
                "downloaded_patches": {},
                "last_season_checked": 0
            }

    def save_download_history(self):
        with open(self.download_history_file, 'w') as f:
            json.dump(self.download_history, f, indent=2)

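    # Shape of download_history.json on disk (illustrative values, assuming one
    # patch has been fetched; the URL path is a placeholder):
    # {
    #   "last_update": "2025-08-03T12:23:15",
    #   "downloaded_patches": {
    #     "s1_w1": {
    #       "filename": "patches/Season1/Week1.bps",
    #       "downloaded_at": "2025-08-03T12:23:15",
    #       "url": "https://www.romhackraces.com/<path-to-patch>.bps"
    #     }
    #   },
    #   "last_season_checked": 1
    # }
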
    def rate_limited_request(self, url):
        self.debug_print(f"Making request to: {url}")
        current_time = time.time()
        time_since_last = current_time - self.last_request
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        response = self.session.get(url)
        self.last_request = time.time()
        self.debug_print(f"Response status code: {response.status_code}")
        return response

    def get_week_number(self, element):
        """Extract week number from span element containing number images"""
        # First find the span with font-size:18px that contains 'Week'
        week_span = element.find('span', style='font-size:18px;')
        if not week_span or 'Week' not in week_span.text:
            return None
        # Get all number images in this span
        number_images = week_span.find_all('img')
        if not number_images:
            self.debug_print("Found week span but no number images")
            return None
        try:
            # Extract numbers from image filenames
            numbers = [img['src'].split('/')[-1].split('.')[0] for img in number_images]
            week_num = int(''.join(numbers))
            self.debug_print(f"Found week number: {week_num}")
            return week_num
        except (ValueError, KeyError, IndexError) as e:
            self.debug_print(f"Error parsing week number: {e}")
            return None

    def download_patch(self, url, week_number, season_number):
        """Download a single .bps patch unless it is already in the history."""
        patch_id = f"s{season_number}_w{week_number}"
        if patch_id in self.download_history["downloaded_patches"]:
            self.debug_print(f"Patch {patch_id} already downloaded")
            return False
        response = self.rate_limited_request(url)
        if response.status_code == 200:
            season_dir = os.path.join('patches', f"Season{season_number}")
            os.makedirs(season_dir, exist_ok=True)
            filename = f"Week{week_number}.bps"
            filepath = os.path.join(season_dir, filename)
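            # e.g. filepath == "patches/Season3/Week12.bps" (illustrative values)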
            with open(filepath, 'wb') as f:
                f.write(response.content)
            self.download_history["downloaded_patches"][patch_id] = {
                "filename": filepath,
                "downloaded_at": datetime.now().isoformat(),
                "url": url
            }
            self.save_download_history()
            print(f"Downloaded Season {season_number} Week {week_number}")
            return True
        else:
            print(f"Failed to download Season {season_number} Week {week_number} - Status code: {response.status_code}")
            return False

    def get_seasons(self):
        """Collect season numbers from the navigation links, with a regex fallback."""
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        seasons = set()  # use a set to avoid duplicates
        # Find the season navigation section - looking for text that starts with "Season"
        season_text = soup.find(string=lambda t: t and t.strip().startswith('Season'))
        if season_text:
            parent = season_text.parent
            # Get all links and the bold span that follow "Season"
            elements = parent.find_all(['a', 'span'])
            for element in elements:
                if element.name == 'a':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue
                elif element.name == 'span' and element.get('style') == 'font-weight:bold;':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue
        if not seasons:
            self.debug_print("Warning: No seasons found in normal parsing")
            # Fallback: try to find any numbers in season links
            season_links = soup.find_all('a', href=lambda h: h and 'season=' in h)
            for link in season_links:
                match = re.search(r'season=(\d+)', link['href'])
                if match:
                    seasons.add(int(match.group(1)))
        # Always include season 1
        seasons.add(1)
        sorted_seasons = sorted(seasons)
        self.debug_print(f"Found seasons: {sorted_seasons}")
        if len(sorted_seasons) < 2:
            self.debug_print("Warning: Found unusually few seasons, might indicate parsing error")
        return sorted_seasons

    def test_parse(self):
        """Test parsing on the first page"""
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find first patch link
        first_patch = soup.find('a', href=lambda href: href and href.endswith('.bps'))
        if first_patch:
            self.debug_print(f"Test found patch link: {first_patch['href']}")
        else:
            self.debug_print("Test could not find any patch links")
        # Find first week span
        first_week = soup.find('span', style='font-size:18px;')
        if first_week:
            self.debug_print(f"Test found week span: {first_week.text}")
            number_images = first_week.find_all('img')
            self.debug_print(f"Number images found: {len(number_images)}")
            for img in number_images:
                self.debug_print(f"Image source: {img['src']}")
        else:
            self.debug_print("Test could not find any week spans")

    def scrape_season(self, season_number):
        """Download every new patch found on a single season's page."""
        url = f"{self.base_url}?season={season_number}"
        print(f"\nScraping Season {season_number}")
        response = self.rate_limited_request(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        downloads_this_season = 0
        # Find all info divs
        info_divs = soup.find_all('div', class_='info')
        for info_div in info_divs:
            # Check if this div contains a week number
            week_num = self.get_week_number(info_div)
            if week_num is None:
                continue
            self.debug_print(f"Processing Week {week_num}")
            # Look for the patch link in the next table cell
            table_cell = info_div.find_next('td', valign='top', align='right')
            if table_cell:
                patch_link = table_cell.find('a', href=lambda href: href and href.endswith('.bps'))
                if patch_link:
                    self.debug_print(f"Found patch link: {patch_link['href']}")
                    patch_url = urljoin("https://www.romhackraces.com/", patch_link['href'])
                    self.debug_print(f"Full patch URL: {patch_url}")
                    if self.download_patch(patch_url, week_num, season_number):
                        downloads_this_season += 1
                else:
                    self.debug_print(f"No patch link found for Week {week_num}")
            else:
                self.debug_print(f"No table cell found for Week {week_num}")
        self.debug_print(f"Downloads this season: {downloads_this_season}")
        return downloads_this_season

    def scrape_all_seasons(self):
        """Scrape every season at or after the last one checked."""
        self.test_parse()
        seasons = self.get_seasons()
        print(f"Found {len(seasons)} seasons to scrape")
        total_downloads = 0
        last_season_checked = self.download_history["last_season_checked"]
        for season in seasons:
            if season < last_season_checked:
                self.debug_print(f"Skipping Season {season} - already checked")
                continue
            downloads = self.scrape_season(season)
            total_downloads += downloads
            self.download_history["last_season_checked"] = max(
                season,
                self.download_history["last_season_checked"]
            )
        self.download_history["last_update"] = datetime.now().isoformat()
        self.save_download_history()
        print(f"\nDownload session complete. Downloaded {total_downloads} new patches.")

def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Romhack Race Scraper')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug output')
    args = parser.parse_args()
    scraper = RomhackRaceScraper()
    scraper.debug = args.debug  # Set debug flag based on command line argument
    # Check if we have existing downloads
    if os.path.exists("download_history.json"):
        print("Found existing download history")
        print(f"Last update: {scraper.download_history['last_update']}")
        print(f"Previously downloaded patches: {len(scraper.download_history['downloaded_patches'])}")
        print("Checking for new patches...\n")
    else:
        print("No download history found. Will download all patches.\n")
    scraper.scrape_all_seasons()


if __name__ == '__main__':
    main()