Add fetch_all method to fetch proxies from list of source URLs
This commit is contained in:
parent
f263ef5884
commit
91ec19f659
@ -1,4 +1,5 @@
|
|||||||
"""Proxy harvester module"""
|
"""Proxy harvester module"""
|
||||||
|
import concurrent.futures
|
||||||
import re
|
import re
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@ -10,12 +11,28 @@ def fetch_list(url):
|
|||||||
except requests.RequestException:
|
except requests.RequestException:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# proxy_regex = r"(?:\d{1,3}\.){3}\d{1,3}:\d+"
|
|
||||||
proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
|
proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
|
||||||
proxies = re.findall(proxy_regex, response.text)
|
proxies = re.findall(proxy_regex, response.text)
|
||||||
return proxies
|
return proxies
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_all(urls, max_workers=8):
    """Fetch proxies from several source URLs concurrently.

    Every URL is downloaded with ``fetch_list`` on a worker thread and the
    individual results are merged into one list without duplicates.

    Args:
        urls: Iterable of proxy-list URLs to download.
        max_workers: Maximum number of worker threads used for the downloads.

    Returns:
        A list of unique proxy strings in first-seen order. Sources are
        merged as they complete, so the relative order of proxies coming
        from different sources is not deterministic.
    """
    proxies = []
    # Companion set gives O(1) duplicate checks while the list keeps order.
    seen = set()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_list, url) for url in urls]

        for future in concurrent.futures.as_completed(futures):
            try:
                proxy_list = future.result()
            except Exception:
                # Best effort: a single failing source must not abort the
                # others. Unlike a bare ``except``, this lets
                # KeyboardInterrupt and SystemExit propagate.
                continue

            for proxy in proxy_list:
                if proxy not in seen:
                    seen.add(proxy)
                    proxies.append(proxy)

    return proxies
|
||||||
|
|
||||||
|
|
||||||
def validate(proxy, type='http'):
|
def validate(proxy, type='http'):
|
||||||
# Check regex
|
# Check regex
|
||||||
return None
|
return None
|
||||||
|
5
tests/data/proxies2.txt
Normal file
5
tests/data/proxies2.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
127.0.0.1:9000
|
||||||
|
garbage line lol
|
||||||
|
127.0.0.1:9001
|
||||||
|
username:pa$$@word@127.0.0.1:9002
|
||||||
|
127.0.0.1:9999
|
@ -1,7 +1,7 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
import pytest
|
import pytest
|
||||||
from harvester.proxy import fetch_list
|
from harvester.proxy import fetch_list, fetch_all
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='session', autouse=True)
|
@pytest.fixture(scope='session', autouse=True)
|
||||||
@ -16,17 +16,24 @@ def start_web_server():
|
|||||||
|
|
||||||
def test_fetch_list():
|
def test_fetch_list():
|
||||||
expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
|
expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
|
||||||
result = fetch_list('http://localhost:8888/proxies.txt')
|
result = fetch_list('http://localhost:8888/proxies1.txt')
|
||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
def test_fetch_list_fail():
|
def test_fetch_list_fail():
|
||||||
expected = []
|
expected = []
|
||||||
result = fetch_list('http://localhost:12345/proxies.txt')
|
result = fetch_list('http://localhost:12345/proxies1.txt')
|
||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
def test_fetch_list_only_valid():
|
def test_fetch_list_only_valid():
|
||||||
expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
|
expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
|
||||||
result = fetch_list('http://localhost:8888/proxies.txt')
|
result = fetch_list('http://localhost:8888/proxies1.txt')
|
||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_all():
    """fetch_all merges both proxy lists and returns each proxy only once."""
    expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002', '127.0.0.1:9999']
    result = fetch_all(['http://localhost:8888/proxies1.txt', 'http://localhost:8888/proxies2.txt'])
    # Sources are merged in completion order, so compare without regard to
    # ordering. Comparing sorted lists (rather than checking membership only)
    # also fails when duplicates or unexpected entries are returned.
    assert sorted(result) == sorted(expected)
|
||||||
|
Loading…
Reference in New Issue
Block a user