2023-11-06 19:41:05 +00:00
|
|
|
"""Harvester: Proxy collection tool
|
|
|
|
Inspired by https://github.com/acidvegas/proxytools
|
|
|
|
"""
|
2023-11-08 01:30:37 +00:00
|
|
|
import time
|
2023-11-07 23:53:07 +00:00
|
|
|
import concurrent.futures
|
2023-11-06 21:47:22 +00:00
|
|
|
import logging
|
2023-11-07 23:53:07 +00:00
|
|
|
from harvester.proxy import fetch_all, validate_socks
|
2023-11-06 19:41:05 +00:00
|
|
|
|
|
|
|
|
2023-11-07 23:53:07 +00:00
|
|
|
def load_urls(path):
    """Load proxy source URLs from a text file.

    The file is read as UTF-8 with one entry per line. Surrounding
    whitespace is stripped from every line, and lines that are empty after
    stripping are skipped so callers never receive an empty URL.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of the non-empty, stripped lines, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'r', encoding='utf-8') as file:
        # Iterate the file object directly instead of materialising
        # readlines(); the result is the same without the throwaway list.
        stripped = (line.strip() for line in file)
        return [url for url in stripped if url]
|
|
|
|
|
2023-11-06 19:41:05 +00:00
|
|
|
|
2023-11-08 01:30:37 +00:00
|
|
|
def write_file(path, data):
    """Write text to a file, creating missing parent directories first.

    Args:
        path: Destination file path. An existing file is overwritten.
        data: String to write; it is encoded as UTF-8.

    Raises:
        OSError: If a directory or the file cannot be created or written.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import os

    parent = os.path.dirname(path)
    if parent:
        # A bare filename has an empty parent, and os.makedirs('') raises,
        # so only create directories when there is a parent component.
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as file:
        file.write(data)
|
|
|
|
|
|
|
|
|
2023-11-06 19:41:05 +00:00
|
|
|
def main():
    """Fetch proxies, validate them concurrently, and save the working ones.

    Source URLs are read from ``data/proxy-sources.txt`` and passed to
    ``fetch_all``. Every returned proxy is submitted to ``validate_socks``
    on a thread pool. A proxy counts as valid when its validation call
    returns a response whose ``raise_for_status()`` does not raise. Valid
    proxies are printed as they complete, written newline-separated to a
    timestamped file under ``proxies/``, and then printed once more.
    """
    logging.basicConfig(level=logging.WARN)

    # Gather candidate proxies from every listed source
    sources = load_urls('data/proxy-sources.txt')
    proxies = fetch_all(sources)
    print(f'Fetched {len(proxies)} proxies!')

    # Validate in parallel; each future maps back to the proxy it checks
    valid = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=128) as pool:
        pending = {}
        for candidate in proxies:
            pending[pool.submit(validate_socks, candidate)] = candidate

        for finished in concurrent.futures.as_completed(pending):
            proxy = pending[finished]
            try:
                response = finished.result()
                response.raise_for_status()
            except Exception as error:
                # TODO: Handle exceptions differently. See https://git.juggalol.com/agatha/harvester/issues/1.
                # Logged at INFO, which is below the WARN level set above.
                logging.info(str(error))
            else:
                # NOTE(review): the response body is presumably the exit IP
                # seen through the proxy - confirm against validate_socks.
                ip = response.text.strip()
                valid.append(proxy)
                print(f'{proxy} -> {ip}')

    # Write to file with timestamp
    stamp = time.strftime("%Y%m%d%H%M%S")
    write_file(path=f'proxies/valid-socks-{stamp}.txt', data='\n'.join(valid))

    for proxy in valid:
        print(proxy)
|
2023-11-06 19:41:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|