def fetch_all(urls, max_workers=8):
    """Fetch proxy lists from several URLs concurrently and merge them.

    Every URL is handed to ``fetch_list`` on a worker thread. The proxies
    returned by each successful fetch are combined into one list with
    duplicates removed.

    Args:
        urls: Iterable of proxy-list URLs to download.
        max_workers: Maximum number of worker threads to use.

    Returns:
        A list of unique proxy strings. Proxies coming from the same source
        keep their relative order; sources are merged in the order their
        downloads complete, so the overall order may vary between runs.
        Sources whose fetch raised an exception contribute nothing.
    """
    import logging

    proxies = []
    # Set gives O(1) duplicate checks; the list preserves first-seen order.
    seen = set()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_list, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                proxy_list = future.result()
            except Exception:
                # Best effort: one broken source must not abort the whole
                # harvest, but the failure is logged instead of being
                # silently discarded. KeyboardInterrupt/SystemExit are not
                # caught here so the process can still be stopped.
                logging.getLogger(__name__).warning(
                    "Failed to fetch proxies from %s", url, exc_info=True
                )
                continue

            for proxy in proxy_list:
                if proxy not in seen:
                    seen.add(proxy)
                    proxies.append(proxy)

    return proxies
def test_fetch_list():
    """fetch_list returns the proxies found in the proxies1.txt fixture."""
    expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
    result = fetch_list('http://localhost:8888/proxies1.txt')
    assert result == expected


def test_fetch_list_fail():
    """fetch_list returns an empty list when the request fails."""
    expected = []
    # NOTE(review): presumably nothing listens on port 12345, so the
    # request errors out -- confirm against the test server fixture.
    result = fetch_list('http://localhost:12345/proxies1.txt')
    assert result == expected


def test_fetch_list_only_valid():
    """fetch_list returns only proxy-shaped entries from proxies1.txt."""
    expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
    result = fetch_list('http://localhost:8888/proxies1.txt')
    assert result == expected


def test_fetch_all():
    """fetch_all merges both fixtures into exactly the unique proxies."""
    expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002', '127.0.0.1:9999']
    result = fetch_all(['http://localhost:8888/proxies1.txt', 'http://localhost:8888/proxies2.txt'])
    # Sources are merged in completion order, so compare without regard to
    # order. Exact equality (rather than membership only) also fails on
    # duplicated entries or on garbage lines leaking into the result.
    assert sorted(result) == sorted(expected)