Initial commit

This commit is contained in:
agatha 2023-11-06 14:41:05 -05:00
commit 80ea0b32be
10 changed files with 92 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
.idea/
venv/
__pycache__/
*.py[cod]

7
README.md Normal file
View File

@ -0,0 +1,7 @@
# Harvester
Python package for harvesting commonly available data, such as free proxy servers.
## Testing
```
pytest -v
```

0
harvester/__init__.py Normal file
View File

21
harvester/proxy.py Normal file
View File

@ -0,0 +1,21 @@
"""Proxy harvester module"""
import re
import requests
def fetch_list(url, timeout=10):
    """Download *url* and return every proxy address found in the response body.

    Args:
        url: Location of a text resource that lists proxies.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``. Without it a stalled source would block
            the caller indefinitely.

    Returns:
        A list of matched strings in the order they appear in the response:
        ``ip:port``, optionally preceded by a credentials-style prefix ending
        in ``@``. An empty list is returned if the request fails or the
        server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: a failing source simply contributes no proxies.
        return []

    # Optional "user:pass@"-style prefix, then a dotted-quad address and port.
    proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
    return re.findall(proxy_regex, response.text)
def validate(proxy, type='http'):
    """Placeholder proxy validator; no checks are implemented yet.

    Both arguments are currently ignored and the function always
    returns ``None``.
    """
    # TODO: check the proxy string against a regex.
    return None

21
main.py Normal file
View File

@ -0,0 +1,21 @@
"""Harvester: Proxy collection tool
Inspired by https://github.com/acidvegas/proxytools
"""
from harvester.proxy import fetch_list
# Proxy-list sources that main() fetches, in order.
URLS = [
    'https://api.openproxylist.xyz/socks4.txt',
    'https://api.openproxylist.xyz/socks5.txt',
    'https://api.proxyscrape.com/?request=displayproxies&proxytype=socks4',
]
def main():
    """Fetch each configured source and print the proxies it yields."""
    for source in URLS:
        print(fetch_list(source))


if __name__ == '__main__':
    main()

1
requirements-dev.txt Normal file
View File

@ -0,0 +1 @@
pytest

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
requests

0
tests/__init__.py Normal file
View File

4
tests/data/proxies.txt Normal file
View File

@ -0,0 +1,4 @@
127.0.0.1:9000
garbage line lol
127.0.0.1:9001
username:pa$$@word@127.0.0.1:9002

32
tests/test_harvester.py Normal file
View File

@ -0,0 +1,32 @@
import subprocess
import time
import pytest
from harvester.proxy import fetch_list
@pytest.fixture(scope='session', autouse=True)
def start_web_server():
    """Serve ``tests/data`` over HTTP on port 8888 for the whole test session.

    The server runs as a child process and is stopped when the session ends.
    """
    import sys  # local import: only this fixture needs it

    # Launch with the interpreter running the tests; a bare 'python' may be
    # absent from PATH or point at a different installation.
    server_process = subprocess.Popen(
        [sys.executable, '-m', 'http.server', '8888'],
        cwd='tests/data',
    )
    try:
        # Give the server a moment to bind before the first test connects.
        time.sleep(1)
        yield
    finally:
        server_process.terminate()
        try:
            # Reap the child so it does not linger as a zombie process.
            server_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            server_process.kill()
            server_process.wait()
def test_fetch_list():
    """The served fixture file yields its three proxy entries, in order."""
    harvested = fetch_list('http://localhost:8888/proxies.txt')
    assert harvested == [
        '127.0.0.1:9000',
        '127.0.0.1:9001',
        'username:pa$$@word@127.0.0.1:9002',
    ]
def test_fetch_list_fail():
    """A failed request against port 12345 yields an empty list."""
    harvested = fetch_list('http://localhost:12345/proxies.txt')
    assert harvested == []
def test_fetch_list_only_valid():
    """Only proxy-shaped lines of the fixture file are returned."""
    harvested = fetch_list('http://localhost:8888/proxies.txt')
    valid_entries = [
        '127.0.0.1:9000',
        '127.0.0.1:9001',
        'username:pa$$@word@127.0.0.1:9002',
    ]
    assert harvested == valid_entries