"""Scrape every media URL from a Podcast Generator site into urls.txt."""

import requests
from bs4 import BeautifulSoup

# Seconds to wait for any single HTTP request; without a timeout,
# requests.get() can block forever on a stalled server.
REQUEST_TIMEOUT = 30


def fetch_page_html(url):
    """Fetch *url* and return the raw response body as bytes.

    Raises:
        Exception: if the server responds with a non-200 status code.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.content
    raise Exception(f'Failed to fetch HTML from {url}')


def get_total_pages(url):
    """Return the number of result pages shown in the pagination bar.

    Falls back to 1 when the page has no <ul class="pagination"> element
    (a site whose episodes fit on a single page renders no pagination),
    instead of crashing with AttributeError on the missing element.
    """
    html = fetch_page_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    pagination_element = soup.find('ul', class_='pagination')
    if pagination_element is None:
        # No pagination bar: everything fits on one page.
        return 1
    page_items = pagination_element.find_all('li', class_='page-item')
    # NOTE(review): assumes every page-item is a numbered page link with
    # no prev/next arrows mixed in — confirm against the live markup.
    return len(page_items)


def extract_media_urls(html):
    """Extract the href of every <a download> link in *html*."""
    soup = BeautifulSoup(html, 'html.parser')
    download_links = soup.find_all('a', attrs={'download': True})
    # Guard against anchors that carry a download attribute but no href,
    # which would otherwise raise KeyError.
    return [link['href'] for link in download_links if link.has_attr('href')]


def get_all_media_urls(base_url):
    """Collect site-relative media links from every page, prefixed with *base_url*."""
    total_pages = get_total_pages(base_url)
    media_urls = []
    for page_number in range(1, total_pages + 1):
        page_url = f'{base_url}/index.php?page={page_number}'
        html = fetch_page_html(page_url)
        for link in extract_media_urls(html):
            media_urls.append(f'{base_url}/{link}')
    return media_urls


def main():
    """Scrape the configured site and write one media URL per line to urls.txt."""
    base_url = 'https://apcs.rhonk.cloud'
    media_urls = get_all_media_urls(base_url)
    with open('urls.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(media_urls))


if __name__ == '__main__':
    main()