harvester/main.py

"""Harvester: Proxy collection tool
Inspired by https://github.com/acidvegas/proxytools
"""
import concurrent.futures
import logging

from harvester.proxy import fetch_all, validate_socks
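
# fetch_all and validate_socks are defined in harvester.proxy (not shown here).
# Based on how they are used below, fetch_all(urls) is assumed to return an
# iterable of proxy address strings, and validate_socks(proxy) is assumed to
# return a requests-style Response from an IP-echo endpoint reached through
# the SOCKS proxy.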


def load_urls(path):
    """Read proxy source URLs from a text file, one per line, skipping blank lines."""
    with open(path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]
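
# The proxy source list at data/proxy-sources.txt is expected to hold one
# plain-text proxy list URL per line; the entries below are placeholders, not
# URLs from the actual repository:
#   https://example.com/socks5.txt
#   https://example.org/proxies/socks5-list.txt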


def main():
    """Main entry point."""
    logging.basicConfig(level=logging.WARNING)

    # Load proxy source list and fetch proxies
    urls = load_urls('data/proxy-sources.txt')
    proxies = fetch_all(urls)
    print(f'Fetched {len(proxies)} proxies!')

    # Concurrently validate proxies with ThreadPoolExecutor
    valid = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=128) as executor:
        validate_futures = {executor.submit(validate_socks, proxy): proxy for proxy in proxies}
        for future in concurrent.futures.as_completed(validate_futures):
            proxy = validate_futures[future]
            try:
                response = future.result()
                response.raise_for_status()
            except Exception as exception:
                # TODO: Handle exceptions differently. See issues.
                logging.info(str(exception))
                continue
            ip = response.text.strip()
            valid.append(proxy)
            print(f'{proxy} -> {ip}')

    # Persist the working proxies, one per line
    with open('data/valid-socks.txt', 'w', encoding='utf-8') as file:
        file.write('\n'.join(valid))

    for proxy in valid:
        print(proxy)


if __name__ == '__main__':
    main()