commit 80ea0b32be29d7618281e881bda2dfc61be2e7f3
Author: agatha
Date:   Mon Nov 6 14:41:05 2023 -0500

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..646a1a6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.idea/
+venv/
+
+__pycache__/
+*.py[cod]
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8ad6cda
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+# Harvester
+Python package for harvesting commonly available data, such as free proxy servers.
+
+## Testing
+```
+pytest -v
+```
diff --git a/harvester/__init__.py b/harvester/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/harvester/proxy.py b/harvester/proxy.py
new file mode 100644
index 0000000..7854f88
--- /dev/null
+++ b/harvester/proxy.py
@@ -0,0 +1,28 @@
+"""Proxy harvester module"""
+import re
+
+import requests
+
+
+def fetch_list(url, timeout=10):
+    """Fetch ``url`` and return the proxy addresses found in the response.
+
+    ``timeout`` (seconds) is passed to ``requests.get`` so that an
+    unresponsive server cannot block the caller forever. Returns an empty
+    list when the request fails.
+    """
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.RequestException:
+        return []
+
+    # ip:port, optionally preceded by "user:password@" credentials.
+    proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
+    return re.findall(proxy_regex, response.text)
+
+
+def validate(proxy, type='http'):
+    """Validate ``proxy`` for the given proxy ``type`` (not implemented yet)."""
+    # TODO: check the proxy against the regex.
+    return None
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..14ea066
--- /dev/null
+++ b/main.py
@@ -0,0 +1,22 @@
+"""Harvester: Proxy collection tool
+Inspired by https://github.com/acidvegas/proxytools
+"""
+from harvester.proxy import fetch_list
+
+
+URLS = [
+    'https://api.openproxylist.xyz/socks4.txt',
+    'https://api.openproxylist.xyz/socks5.txt',
+    'https://api.proxyscrape.com/?request=displayproxies&proxytype=socks4',
+]
+
+
+def main():
+    """Main entry point."""
+    for url in URLS:
+        proxies = fetch_list(url)
+        print(proxies)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..55b033e
--- /dev/null
+++ 
b/requirements-dev.txt
@@ -0,0 +1 @@
+pytest
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..663bd1f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/proxies.txt b/tests/data/proxies.txt
new file mode 100644
index 0000000..a0f5a15
--- /dev/null
+++ b/tests/data/proxies.txt
@@ -0,0 +1,4 @@
+127.0.0.1:9000
+garbage line lol
+127.0.0.1:9001
+username:pa$$@word@127.0.0.1:9002
diff --git a/tests/test_harvester.py b/tests/test_harvester.py
new file mode 100644
index 0000000..a34e00b
--- /dev/null
+++ b/tests/test_harvester.py
@@ -0,0 +1,45 @@
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+import pytest
+
+from harvester.proxy import fetch_list
+
+DATA_DIR = Path(__file__).parent / 'data'
+
+
+@pytest.fixture(scope='session', autouse=True)
+def start_web_server():
+    """Serve ``tests/data`` over HTTP on port 8888 for the test session."""
+    # sys.executable runs the server under the interpreter running the tests,
+    # and DATA_DIR keeps the fixture independent of the working directory.
+    server_process = subprocess.Popen(
+        [sys.executable, '-m', 'http.server', '8888'], cwd=DATA_DIR
+    )
+    time.sleep(1)
+
+    yield
+
+    server_process.terminate()
+    # Reap the child so it does not linger after the session ends.
+    server_process.wait()
+
+
+def test_fetch_list():
+    expected = ['127.0.0.1:9000', '127.0.0.1:9001', 'username:pa$$@word@127.0.0.1:9002']
+    result = fetch_list('http://localhost:8888/proxies.txt')
+    assert result == expected
+
+
+def test_fetch_list_fail():
+    expected = []
+    result = fetch_list('http://localhost:12345/proxies.txt')
+    assert result == expected
+
+
+def test_fetch_list_only_valid():
+    """Lines that are not proxy addresses must be left out of the result."""
+    result = fetch_list('http://localhost:8888/proxies.txt')
+    assert not any('garbage' in proxy for proxy in result)