Initial commit
commit e30a03f9a2
.gitignore (vendored) | 4 (new file)
@@ -0,0 +1,4 @@
.idea/
venv/

urls.txt
README.md | 2 (new file)
@@ -0,0 +1,2 @@
# Podcast Generator URL Scraper
Shitty code to scrape all media URLs from a site generated with Podcast Generator.
main.py | 57 (new file)
@@ -0,0 +1,57 @@
import requests
from bs4 import BeautifulSoup


def get_total_pages(url):
    """Get the total page count."""
    html = fetch_page_html(url)
    soup = BeautifulSoup(html, 'html.parser')

    pagination_element = soup.find('ul', class_='pagination')
    page_items = pagination_element.find_all('li', class_='page-item')

    return len(page_items)


def fetch_page_html(url):
    """Fetch a page's HTML"""
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        raise Exception(f'Failed to fetch HTML from {url}')


def extract_media_urls(html):
    """Extracts all media URLs from page HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    download_links = soup.find_all('a', attrs={'download': True})
    media_urls = [link['href'] for link in download_links]
    return media_urls


def get_all_media_urls(base_url):
    """Extracts all media URLs from a Podcast Generator site."""
    total_pages = get_total_pages(base_url)

    media_urls = []
    for page_number in range(1, total_pages + 1):
        page_url = f'{base_url}/index.php?page={page_number}'
        html = fetch_page_html(page_url)
        media_links = extract_media_urls(html)

        for link in media_links:
            media_urls.append(f'{base_url}/{link}')

    return media_urls


def main():
    base_url = 'https://apcs.rhonk.cloud'
    media_urls = get_all_media_urls(base_url)
    with open('urls.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(media_urls))


if __name__ == '__main__':
    main()
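A caveat on get_total_pages worth flagging: soup.find returns None when no pagination list is present, and Bootstrap-style pagination usually renders 'Previous'/'Next' controls as page-item entries too, so the lookup can crash on a single-page site and len(page_items) can overcount. A minimal defensive sketch (the numeric-label filter is an assumption about the generated markup, not something this commit verifies):

def get_total_pages_safe(url):
    """Like get_total_pages, but tolerates missing pagination markup."""
    soup = BeautifulSoup(fetch_page_html(url), 'html.parser')
    pagination_element = soup.find('ul', class_='pagination')
    if pagination_element is None:
        return 1  # no pagination list, so assume a single page

    page_items = pagination_element.find_all('li', class_='page-item')
    # Keep only items whose visible label is a page number, skipping any
    # 'Previous'/'Next' controls (assumed Bootstrap-like markup).
    numbered = [li for li in page_items if li.get_text(strip=True).isdigit()]
    return len(numbered) or 1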
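Two smaller points: fetch_page_html hand-rolls the status check, and get_all_media_urls joins URLs with string formatting, which double-prefixes if a page ever emits absolute hrefs. If one were refactoring, the idiomatic equivalents would be requests' raise_for_status (slightly different behavior: it raises requests.HTTPError only on 4xx/5xx responses) and urllib.parse.urljoin; a sketch:

from urllib.parse import urljoin

def fetch_page_html(url):
    """Fetch a page's HTML, raising requests.HTTPError on 4xx/5xx."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content

# In get_all_media_urls, urljoin handles relative and absolute hrefs alike:
# media_urls.append(urljoin(base_url + '/', link))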
requirements.txt | 2 (new file)
@@ -0,0 +1,2 @@
requests
beautifulsoup4
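Nothing in the commit documents how to run the scraper, but the layout implies the usual workflow: pip install -r requirements.txt, then python main.py, which writes one URL per line to urls.txt. Because base_url is hard-coded in main(), pointing the scraper at another Podcast Generator site means importing it; a short example (the target URL is a placeholder, not from this commit):

from main import get_all_media_urls

# Placeholder: substitute the root URL of the Podcast Generator site to scrape.
for url in get_all_media_urls('https://example.com'):
    print(url)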