diff --git a/harvester/proxy.py b/harvester/proxy.py index be52075..979a611 100644 --- a/harvester/proxy.py +++ b/harvester/proxy.py @@ -6,6 +6,22 @@ import requests def fetch_list(url): + """Fetches a proxy list from the given URL. + + The HTTP response text will be searched for `ip:port` and `username:password@ip:port` + patterns to accommodate different source list formats. + + If an error occurs while fetching the list, it will be logged at WARNING level + and an empty list will be returned. + + Args: + url (str): The URL to fetch the proxy list from. + + Returns: + list: A list of unique proxy server addresses fetched from the URL. + Duplicates are removed, so the original order is not preserved. + The list is empty if fetching fails. + """ try: response = requests.get(url) response.raise_for_status() @@ -14,9 +30,9 @@ def fetch_list(url): return [] proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+" - proxies = re.findall(proxy_regex, response.text) + proxies = set(re.findall(proxy_regex, response.text)) logging.info(f'Fetched {len(proxies)} proxies from {url}') - return proxies + return list(proxies) def fetch_all(urls, max_workers=8):