70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
"""Proxy harvester module"""
|
|
import concurrent.futures
|
|
import logging
|
|
import re
|
|
import requests
|
|
|
|
|
|
def fetch_list(url, timeout=10):
    """Fetches proxy list from the given URL.

    The HTTP response text will be searched for `ip:port` and
    `username:password@ip:port` patterns to accommodate different source
    list formats.

    Args:
        url (str): The URL to fetch proxy list from.
        timeout (float or tuple, optional): Seconds to wait for the server
            before giving up; passed unchanged to `requests.get`.
            Defaults to 10.

    Returns:
        list: Unique proxy server addresses found in the response, in order
            of first appearance. If an error occurs while fetching the
            list (including a timeout), it will be logged with WARNING and
            an empty list will be returned.
    """
    try:
        # Without a timeout an unresponsive source would block the caller
        # (and a worker thread in fetch_all) indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Include the cause so a failing source can be diagnosed from the log.
        logging.warning(f'Error fetching proxies from {url}: {exc}')
        return []

    # Optional "user:pass@" prefix followed by a dotted-quad IPv4 and a port.
    proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic (a plain set would yield arbitrary order).
    proxies = list(dict.fromkeys(re.findall(proxy_regex, response.text)))
    logging.info(f'Fetched {len(proxies)} proxies from {url}')
    return proxies
|
|
|
|
|
|
def fetch_all(urls, max_workers=8):
    """Fetches proxy server addresses from multiple URLs concurrently.

    Each URL is handed to `fetch_list` on a worker thread. Results are
    merged as the individual fetches complete, so the order of the returned
    addresses depends on which sources finish first.

    Args:
        urls (list): A list of URLs to fetch proxy server addresses from.
        max_workers (int, optional): The maximum number of worker threads to use.
            Defaults to 8.

    Returns:
        list: A list of unique proxy server addresses fetched from all the URLs.
            A source whose fetch raises unexpectedly is logged with WARNING
            and skipped; it does not abort the remaining sources.
    """
    proxies = []
    # Companion set gives O(1) duplicate checks while `proxies` keeps order.
    seen = set()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_list, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                proxy_list = future.result()
            except Exception as exc:
                # Best-effort harvesting: report the failing source and move
                # on, but let KeyboardInterrupt/SystemExit propagate.
                logging.warning(f'Error fetching proxies from {url}: {exc}')
                continue
            for proxy in proxy_list:
                if proxy not in seen:
                    seen.add(proxy)
                    proxies.append(proxy)

    logging.info(f'Fetched {len(proxies)} proxies in total')
    return proxies
|
|
|
|
|
|
def validate(proxy, type='http'):
    """Placeholder for proxy validation; no checks are performed yet.

    Args:
        proxy: The proxy address to validate. Currently not inspected.
        type (str, optional): The proxy type. Currently not inspected.
            Defaults to 'http'.

    Returns:
        None: Always, regardless of the arguments.
    """
    # TODO: check the proxy against a regex (not implemented yet).
    return
|