# harvester/main.py
"""Harvester: Proxy collection tool
Inspired by https://github.com/acidvegas/proxytools
"""
import concurrent.futures
import logging
import os
import time
from datetime import datetime

from harvester.db.models import Proxy
from harvester.db.schema import init_db
from harvester.db.session import SessionFactory
from harvester.proxy import fetch_all, validate_socks
# Database configuration: default to a local SQLite file unless DATABASE_URL
# is set in the environment (e.g. a PostgreSQL DSN in production).
DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///proxies.db')
# Create the schema if it does not exist yet, then build a session factory
# bound to the same database for use by gather_proxies().
init_db(DATABASE_URL)
session_factory = SessionFactory(DATABASE_URL)
def read_file(path):
    """Read a UTF-8 text file and return its lines, whitespace-stripped.

    :param path: path of the file to read
    :return: list of stripped lines (blank lines become empty strings)
    """
    with open(path, 'r', encoding='utf-8') as file:
        # Iterate the file object directly; readlines() would materialize
        # an intermediate list for no benefit.
        return [line.strip() for line in file]
def write_file(path, data):
    """Write *data* to *path* as UTF-8 text, replacing any existing content."""
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(data)
def _record_proxy(session, proxy, ip):
    """Insert or refresh the database row for a validated proxy.

    :param session: active database session
    :param proxy: proxy as a ``host:port`` string
    :param ip: egress IP address the proxy reported
    """
    # Parse host/port exactly once (the original split the string four times).
    # rsplit keeps a host that itself contains ':' intact.
    host, port_text = proxy.rsplit(':', 1)
    port = int(port_text)
    now = datetime.now()
    existing = session.query(Proxy).filter_by(host=host, port=port).first()
    if existing:
        # Already known: refresh the validation timestamp and egress IP.
        existing.date_validated = now
        existing.egress_ip = ip
    else:
        session.add(Proxy(
            host=host,
            port=port,
            egress_ip=ip,
            date_added=now,
            date_validated=now,
        ))
    session.commit()


def gather_proxies():
    """Fetch proxies from the configured sources, validate them concurrently,
    persist valid ones to the database, and write timestamped output files.

    Outputs (under ``proxies/``):
    - ``valid-socks-<ts>.txt``: newline-separated validated proxies
    - ``proxychains-<ts>.conf``: proxychains config listing the validated proxies
    """
    # Load proxy source list and fetch proxies
    urls = read_file('data/proxy-sources.txt')
    proxies = fetch_all(urls)
    print(f'Fetched {len(proxies)} proxies!')

    # Concurrently validate proxies with ThreadPoolExecutor
    valid = []
    # One session for the whole run. The original opened a fresh session per
    # validated proxy and never closed any of them — a connection leak.
    session = session_factory.create_session()
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=128) as executor:
            validate_futures = {executor.submit(validate_socks, proxy): proxy for proxy in proxies}
            for future in concurrent.futures.as_completed(validate_futures):
                proxy = validate_futures[future]
                try:
                    response = future.result()
                    response.raise_for_status()
                except Exception as exception:
                    # TODO: Handle exceptions differently. See https://git.juggalol.com/agatha/harvester/issues/1.
                    logging.info(str(exception))
                    continue
                ip = response.text.strip()
                valid.append(proxy)
                print(f'{proxy} -> {ip}')
                _record_proxy(session, proxy, ip)
    finally:
        # NOTE(review): assumes an SQLAlchemy-style session (it exposes
        # query/add/commit above) — confirm SessionFactory's contract.
        session.close()

    # Create output directory if it does not exist
    os.makedirs('proxies', exist_ok=True)

    # Compute the timestamp once so both output files share the same suffix
    # (the original called strftime twice and could straddle a second).
    timestamp = time.strftime('%Y%m%d%H%M%S')

    # Write validated proxies to file
    write_file(
        path=f'proxies/valid-socks-{timestamp}.txt',
        data='\n'.join(valid)
    )

    # Write proxychains conf. Fix: the original listed every *fetched* proxy
    # here; only the *validated* ones belong in the config, consistent with
    # the valid-socks file above.
    proxychains_template = read_file('templates/proxychains.conf')
    proxychains_data = [f'socks5 {proxy.replace(":", " ")}' for proxy in valid]
    write_file(
        path=f'proxies/proxychains-{timestamp}.conf',
        data='\n'.join(proxychains_template + proxychains_data)
    )
def main():
    """Entry point: configure logging, then run a harvest pass."""
    logging.basicConfig(level=logging.WARN)
    gather_proxies()


if __name__ == '__main__':
    main()