"""Harvester: Proxy collection tool Inspired by https://github.com/acidvegas/proxytools """ import time import concurrent.futures import logging import os from datetime import datetime from harvester.db.models import Proxy from harvester.db.schema import init_db from harvester.db.session import SessionFactory from harvester.proxy import fetch_all, validate_socks DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///proxies.db') init_db(DATABASE_URL) session_factory = SessionFactory(DATABASE_URL) def read_file(path): with open(path, 'r', encoding='utf-8') as file: data = [line.strip() for line in file.readlines()] return data def write_file(path, data): with open(path, 'w', encoding='utf-8') as file: file.write(data) def validate_proxies(): pass def gather_proxies(): # Load proxy source list and fetch proxies urls = read_file('data/proxy-sources.txt') proxies = fetch_all(urls) print(f'Fetched {len(proxies)} proxies!') # Concurrently validate proxies with ThreadPoolExecutor valid = [] with concurrent.futures.ThreadPoolExecutor(max_workers=128) as executor: validate_futures = {executor.submit(validate_socks, proxy): proxy for proxy in proxies} for future in concurrent.futures.as_completed(validate_futures): proxy = validate_futures[future] try: response = future.result() response.raise_for_status() except Exception as exception: # TODO: Handle exceptions differently. See https://git.juggalol.com/agatha/harvester/issues/1. logging.info(str(exception)) continue ip = response.text.strip() valid.append(proxy) print(f'{proxy} -> {ip}') # Save to DB session = session_factory.create_session() existing_proxy = session.query(Proxy).filter_by(host=proxy.split(':')[0], port=int(proxy.split(':')[1])).first() if existing_proxy: existing_proxy.date_validated = datetime.now() existing_proxy.egress_ip = ip else: new_proxy = Proxy( host=proxy.split(':')[0], port=int(proxy.split(':')[1]), egress_ip=ip, date_added=datetime.now(), date_validated=datetime.now(), ) session.add(new_proxy) session.commit() # Create output directory if it does not exist if not os.path.exists('proxies'): os.makedirs('proxies') # Write to file with timestamp write_file( path=f'proxies/valid-socks-{time.strftime("%Y%m%d%H%M%S")}.txt', data='\n'.join(valid) ) # Write proxychains conf proxychains_template = read_file('templates/proxychains.conf') proxychains_data = [f'socks5 {proxy.replace(":", " ")}' for proxy in proxies] write_file( path=f'proxies/proxychains-{time.strftime("%Y%m%d%H%M%S")}.conf', data='\n'.join(proxychains_template + proxychains_data) ) def main(): """Main entry point.""" logging.basicConfig(level=logging.WARN) gather_proxies() if __name__ == '__main__': main()