# harvester/harvester/proxy.py

"""Proxy harvester module"""
import concurrent.futures
import logging
import re
import requests


def fetch_list(url):
    """Fetch a proxy list from the given URL.

    The HTTP response text is searched for `ip:port` and
    `username:password@ip:port` patterns to accommodate different
    source list formats.

    Args:
        url (str): The URL to fetch the proxy list from.

    Returns:
        list: A list of unique proxy server addresses fetched from the
            URL. If an error occurs while fetching the list, it is
            logged at WARNING level and an empty list is returned.
    """
    try:
        # A timeout keeps one slow source from stalling a worker thread
        # indefinitely (10 s is an arbitrary default, not from the original).
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        logging.warning(f'Error fetching proxies from {url}: {exc}')
        return []
    # Matches "ip:port" with an optional "username:password@" prefix.
    proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
    # A set removes duplicates within a single source list.
    proxies = set(re.findall(proxy_regex, response.text))
    logging.info(f'Fetched {len(proxies)} proxies from {url}')
    return list(proxies)
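

# Illustrative example (hypothetical input): a response body containing
#   "203.0.113.7:8080\nuser:secret@198.51.100.2:3128\n"
# would yield ['203.0.113.7:8080', 'user:secret@198.51.100.2:3128']
# (in no guaranteed order, since duplicates are removed via a set).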


def fetch_all(urls, max_workers=8):
    """Fetch proxy server addresses from multiple URLs concurrently.

    Args:
        urls (list): A list of URLs to fetch proxy server addresses from.
        max_workers (int, optional): The maximum number of worker threads
            to use. Defaults to 8.

    Returns:
        list: A list of unique proxy server addresses fetched from all
            the URLs.
    """
    proxies = set()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_list, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                # A set keeps the result free of cross-source duplicates.
                proxies.update(future.result())
            except Exception as exc:
                # fetch_list already returns [] on HTTP errors, so this only
                # catches unexpected failures; log them rather than silently
                # swallowing everything with a bare `except: pass`.
                logging.warning(f'Error collecting proxies from {url}: {exc}')
    logging.info(f'Fetched {len(proxies)} proxies in total')
    return list(proxies)


def validate(proxy, type='http'):
    # Syntactic check: "ip:port" with an optional "user:pass@" prefix.
    # (`type` is accepted for API compatibility but unused here.)
    proxy_regex = r"(?:[\S]+:[\S]+@)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
    return re.fullmatch(proxy_regex, proxy) is not None
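

# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of wiring the functions above together. The
# source URLs below are placeholders, and `check_alive` is a hypothetical
# helper that assumes an HTTP proxy able to reach https://httpbin.org/ip.

def check_alive(proxy, timeout=5):
    """Sketch: return True if an HTTP request succeeds through `proxy`."""
    try:
        resp = requests.get(
            'https://httpbin.org/ip',
            proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
            timeout=timeout,
        )
        return resp.ok
    except requests.RequestException:
        return False


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Placeholder source URLs; substitute real proxy-list endpoints.
    sources = [
        'https://example.com/proxies.txt',
        'https://example.org/proxy-list.txt',
    ]
    harvested = fetch_all(sources)
    well_formed = [p for p in harvested if validate(p)]
    print(f'{len(well_formed)} well-formed proxies out of {len(harvested)} harvested')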