Add fetch_all function to fetch proxies from a list of source URLs

This commit is contained in:
agatha 2023-11-06 16:22:38 -05:00
parent f263ef5884
commit 91ec19f659
4 changed files with 34 additions and 5 deletions

View File

@ -1,4 +1,5 @@
"""Proxy harvester module""" """Proxy harvester module"""
import concurrent.futures
import re import re
import requests import requests
@ -10,12 +11,28 @@ def fetch_list(url):
except requests.RequestException: except requests.RequestException:
return [] return []
# proxy_regex = r"(?:\d{1,3}\.){3}\d{1,3}:\d+"
proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+" proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
proxies = re.findall(proxy_regex, response.text) proxies = re.findall(proxy_regex, response.text)
return proxies return proxies
def fetch_all(urls, max_workers=8):
    """Fetch proxies from several source URLs concurrently.

    Every URL is handed to ``fetch_list`` on a thread pool and the returned
    lists are merged into one list without duplicates.

    Args:
        urls: Iterable of source URLs to fetch.
        max_workers: Maximum number of worker threads in the pool.

    Returns:
        A list of unique proxies in first-seen order. Sources are merged in
        the order their fetches complete, so ordering across sources is not
        deterministic.
    """
    import logging  # function-scope import so the module's import block is untouched

    seen = set()  # O(1) membership test; ``proxies`` keeps first-seen order
    proxies = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_list, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                proxy_list = future.result()
            except Exception:
                # Best effort: one failing source must not abort the others,
                # but record which URL failed instead of hiding it. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                logging.getLogger(__name__).warning(
                    "Failed to fetch proxies from %s",
                    future_to_url[future],
                    exc_info=True,
                )
                continue
            for proxy in proxy_list:
                if proxy not in seen:
                    seen.add(proxy)
                    proxies.append(proxy)
    return proxies
def validate(proxy, type='http'): def validate(proxy, type='http'):
# Check regex # Check regex
return None return None

5
tests/data/proxies2.txt Normal file
View File

@ -0,0 +1,5 @@
127.0.0.1:9000
garbage line lol
127.0.0.1:9001
username:pa$$@word@127.0.0.1:9002
127.0.0.1:9999

View File

@ -1,7 +1,7 @@
import subprocess import subprocess
import time import time
import pytest import pytest
from harvester.proxy import fetch_list from harvester.proxy import fetch_list, fetch_all
@pytest.fixture(scope='session', autouse=True) @pytest.fixture(scope='session', autouse=True)
@ -16,17 +16,24 @@ def start_web_server():
def test_fetch_list(): def test_fetch_list():
expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002'] expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
result = fetch_list('http://localhost:8888/proxies.txt') result = fetch_list('http://localhost:8888/proxies1.txt')
assert result == expected assert result == expected
def test_fetch_list_fail(): def test_fetch_list_fail():
expected = [] expected = []
result = fetch_list('http://localhost:12345/proxies.txt') result = fetch_list('http://localhost:12345/proxies1.txt')
assert result == expected assert result == expected
def test_fetch_list_only_valid(): def test_fetch_list_only_valid():
expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002'] expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
result = fetch_list('http://localhost:8888/proxies.txt') result = fetch_list('http://localhost:8888/proxies1.txt')
assert result == expected assert result == expected
def test_fetch_all():
    """fetch_all merges both fixture lists into one de-duplicated result."""
    expected = [
        '127.0.0.1:9000',
        '127.0.0.1:9001',
        'username:pa$$@word@127.0.0.1:9002',
        '127.0.0.1:9999',
    ]
    urls = [
        'http://localhost:8888/proxies1.txt',
        'http://localhost:8888/proxies2.txt',
    ]
    result = fetch_all(urls)
    # Sources are merged in completion order, so compare order-insensitively.
    # Sorting (rather than converting to a set) also fails on duplicates or
    # unexpected extra entries, which a plain membership check would miss.
    assert sorted(result) == sorted(expected)