# podgen-url-scraper/main.py
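"""Scrape every media URL from a Podcast Generator site.

The script walks the site's paginated episode listing, collects the href of
every <a download> link, prefixes each with the base URL, and writes the
results (one per line) to urls.txt.

Run with `python main.py` (requires requests and beautifulsoup4); the
resulting urls.txt can then be fed to a downloader, for example
`wget -i urls.txt`.
"""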
import requests
from bs4 import BeautifulSoup


def get_total_pages(url):
    """Get the total page count from the listing's pagination element."""
    html = fetch_page_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    pagination_element = soup.find('ul', class_='pagination')
    if pagination_element is None:
        # No pagination element means the listing fits on a single page.
        return 1
    page_items = pagination_element.find_all('li', class_='page-item')
    return len(page_items)


def fetch_page_html(url):
    """Fetch a page's HTML, raising if the request does not succeed."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    raise RuntimeError(f'Failed to fetch HTML from {url} (HTTP {response.status_code})')


def extract_media_urls(html):
    """Extract all media URLs (download-link hrefs) from a page's HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    download_links = soup.find_all('a', attrs={'download': True})
    media_urls = [link['href'] for link in download_links]
    return media_urls


def get_all_media_urls(base_url):
    """Extract all media URLs from a Podcast Generator site."""
    total_pages = get_total_pages(base_url)
    media_urls = []
    for page_number in range(1, total_pages + 1):
        page_url = f'{base_url}/index.php?page={page_number}'
        html = fetch_page_html(page_url)
        media_links = extract_media_urls(html)
        for link in media_links:
            # Hrefs are relative paths, so prefix them with the site's base URL.
            media_urls.append(f'{base_url}/{link}')
    return media_urls


def main():
    base_url = 'https://apcs.rhonk.cloud'
    media_urls = get_all_media_urls(base_url)
    with open('urls.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(media_urls))


if __name__ == '__main__':
    main()