# rhrtools/downloader.py
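"""Scrape weekly .bps patches from romhackraces.com.

Walks each season listed on the levels page, downloads any patch not yet
recorded in download_history.json, and saves it under
patches/Season<N>/Week<M>.bps so repeat runs only fetch new weeks.
"""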

import json
import os
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class RomhackRaceScraper:
    def __init__(self):
        self.base_url = "https://www.romhackraces.com/levels.php"
        self.session = requests.Session()
        self.rate_limit = 1  # minimum seconds between requests
        self.last_request = 0  # timestamp of the most recent request
        self.download_history_file = "download_history.json"
        self.download_history = self.load_download_history()
        self.debug = True
        os.makedirs('patches', exist_ok=True)

    def debug_print(self, message):
        if self.debug:
            print(f"DEBUG: {message}")

    def load_download_history(self):
        try:
            with open(self.download_history_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {
                "last_update": "",
                "downloaded_patches": {},
                "last_season_checked": 0
            }

    def save_download_history(self):
        with open(self.download_history_file, 'w') as f:
            json.dump(self.download_history, f, indent=2)

    def rate_limited_request(self, url):
        self.debug_print(f"Making request to: {url}")
        current_time = time.time()
        time_since_last = current_time - self.last_request
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        response = self.session.get(url)
        self.last_request = time.time()
        self.debug_print(f"Response status code: {response.status_code}")
        return response
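
    # Note: with rate_limit = 1, consecutive requests are spaced at least one
    # second apart; e.g. a call made 0.3 s after the previous request sleeps
    # for the remaining 0.7 s before hitting the server.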

    def get_week_number(self, element):
        """Extract week number from span element containing number images"""
        # First find the span with font-size:18px that contains 'Week'
        week_span = element.find('span', style='font-size:18px;')
        if not week_span or 'Week' not in week_span.text:
            return None
        # Get all number images in this span
        number_images = week_span.find_all('img')
        if not number_images:
            self.debug_print("Found week span but no number images")
            return None
        try:
            # Extract the digits from the image filenames and join them
            numbers = [img['src'].split('/')[-1].split('.')[0] for img in number_images]
            week_num = int(''.join(numbers))
            self.debug_print(f"Found week number: {week_num}")
            return week_num
        except (ValueError, KeyError, IndexError) as e:
            self.debug_print(f"Error parsing week number: {e}")
            return None
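
    # Illustrative markup this parser assumes (hypothetical, inferred from the
    # selectors above): the week number is rendered as digit images, e.g.
    #   <span style="font-size:18px;">Week
    #     <img src="images/4.png"><img src="images/2.png"></span>
    # which get_week_number joins into 42 from the "4" and "2" filenames.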

    def download_patch(self, url, week_number, season_number):
        patch_id = f"s{season_number}_w{week_number}"
        if patch_id in self.download_history["downloaded_patches"]:
            self.debug_print(f"Patch {patch_id} already downloaded")
            return False
        response = self.rate_limited_request(url)
        if response.status_code == 200:
            season_dir = os.path.join('patches', f"Season{season_number}")
            os.makedirs(season_dir, exist_ok=True)
            filename = f"Week{week_number}.bps"
            filepath = os.path.join(season_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            self.download_history["downloaded_patches"][patch_id] = {
                "filename": filepath,
                "downloaded_at": datetime.now().isoformat(),
                "url": url
            }
            self.save_download_history()
            print(f"Downloaded Season {season_number} Week {week_number}")
            return True
        else:
            print(f"Failed to download Season {season_number} Week {week_number} - Status code: {response.status_code}")
            return False
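
    # Resulting layout for, say, season 3 week 42: patches/Season3/Week42.bps
    # on disk, plus an "s3_w42" entry in download_history.json recording the
    # file path, download timestamp, and source URL.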

    def get_seasons(self):
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        seasons = set()  # use a set to avoid duplicates
        # Find the season navigation section - looking for text that starts with "Season"
        season_text = soup.find(string=lambda t: t and t.strip().startswith('Season'))
        if season_text:
            parent = season_text.parent
            # Get all links and the bold span that follow "Season"
            elements = parent.find_all(['a', 'span'])
            for element in elements:
                if element.name == 'a':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue
                elif element.name == 'span' and element.get('style') == 'font-weight:bold;':
                    try:
                        seasons.add(int(element.text))
                    except ValueError:
                        continue
        if not seasons:
            self.debug_print("Warning: No seasons found in normal parsing")
            # Fallback: pull season numbers from any 'season=' links
            season_links = soup.find_all('a', href=lambda h: h and 'season=' in h)
            for link in season_links:
                match = re.search(r'season=(\d+)', link['href'])
                if match:
                    seasons.add(int(match.group(1)))
        # Always include season 1
        seasons.add(1)
        sorted_seasons = sorted(seasons)
        self.debug_print(f"Found seasons: {sorted_seasons}")
        if len(sorted_seasons) < 2:
            self.debug_print("Warning: Found unusually few seasons, might indicate parsing error")
        return sorted_seasons
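
    # Assumed season navigation (hypothetical, based on the selectors above):
    #   Season <a href="levels.php?season=1">1</a> ...
    #          <span style="font-weight:bold;">5</span>
    # i.e. past seasons are plain links and the currently viewed season is a
    # bold span, which is why both element types are checked.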

    def test_parse(self):
        """Test parsing on the first page"""
        response = self.rate_limited_request(self.base_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find first patch link
        first_patch = soup.find('a', href=lambda href: href and href.endswith('.bps'))
        if first_patch:
            self.debug_print(f"Test found patch link: {first_patch['href']}")
        else:
            self.debug_print("Test could not find any patch links")
        # Find first week span
        first_week = soup.find('span', style='font-size:18px;')
        if first_week:
            self.debug_print(f"Test found week span: {first_week.text}")
            number_images = first_week.find_all('img')
            self.debug_print(f"Number images found: {len(number_images)}")
            for img in number_images:
                self.debug_print(f"Image source: {img['src']}")
        else:
            self.debug_print("Test could not find any week spans")

    def scrape_season(self, season_number):
        url = f"{self.base_url}?season={season_number}"
        print(f"\nScraping Season {season_number}")
        response = self.rate_limited_request(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        downloads_this_season = 0
        # Find all info divs
        info_divs = soup.find_all('div', class_='info')
        for info_div in info_divs:
            # Check if this div contains a week number
            week_num = self.get_week_number(info_div)
            if week_num is None:
                continue
            self.debug_print(f"Processing Week {week_num}")
            # Look for the patch link in the next table cell
            table_cell = info_div.find_next('td', valign='top', align='right')
            if table_cell:
                patch_link = table_cell.find('a', href=lambda href: href and href.endswith('.bps'))
                if patch_link:
                    self.debug_print(f"Found patch link: {patch_link['href']}")
                    patch_url = urljoin("https://www.romhackraces.com/", patch_link['href'])
                    self.debug_print(f"Full patch URL: {patch_url}")
                    if self.download_patch(patch_url, week_num, season_number):
                        downloads_this_season += 1
                else:
                    self.debug_print(f"No patch link found for Week {week_num}")
            else:
                self.debug_print(f"No table cell found for Week {week_num}")
        self.debug_print(f"Downloads this season: {downloads_this_season}")
        return downloads_this_season
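
    # Layout assumption in scrape_season: each week's "info" div is followed
    # in document order by a right-aligned <td> containing the .bps link, so
    # find_next() is used instead of navigating a shared parent element.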

    def scrape_all_seasons(self):
        self.test_parse()
        seasons = self.get_seasons()
        print(f"Found {len(seasons)} seasons to scrape")
        total_downloads = 0
        last_season_checked = self.download_history["last_season_checked"]
        for season in seasons:
            # Strict "<" so the most recently checked season is re-scanned
            # for weeks posted since the last run
            if season < last_season_checked:
                self.debug_print(f"Skipping Season {season} - already checked")
                continue
            downloads = self.scrape_season(season)
            total_downloads += downloads
            self.download_history["last_season_checked"] = max(
                season,
                self.download_history["last_season_checked"]
            )
        self.download_history["last_update"] = datetime.now().isoformat()
        self.save_download_history()
        print(f"\nDownload session complete. Downloaded {total_downloads} new patches.")

def main():
    scraper = RomhackRaceScraper()
    # Check if we have existing downloads
    if os.path.exists("download_history.json"):
        print("Found existing download history")
        print(f"Last update: {scraper.download_history['last_update']}")
        print(f"Previously downloaded patches: {len(scraper.download_history['downloaded_patches'])}")
        print("Checking for new patches...\n")
    else:
        print("No download history found. Will download all patches.\n")
    scraper.scrape_all_seasons()


if __name__ == '__main__':
    main()
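
# Typical behavior (assuming the site layout sketched above): the first run
# downloads every season's patches; later runs resume from last_season_checked
# and fetch only weeks missing from download_history.json.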