Initial commit
This commit is contained in:
commit
80ea0b32be
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
.idea/
|
||||||
|
venv/
|
||||||
|
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
7
README.md
Normal file
7
README.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Harvester
|
||||||
|
Python package for harvesting commonly available data, such as free proxy servers.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
```
|
||||||
|
pytest -v
|
||||||
|
```
|
0
harvester/__init__.py
Normal file
0
harvester/__init__.py
Normal file
21
harvester/proxy.py
Normal file
21
harvester/proxy.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
"""Proxy harvester module"""
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_list(url, timeout=10):
    """Download *url* and return every proxy address found in the response.

    Args:
        url: Address of a text resource that lists proxies.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of matched strings in the order they appear in the response
        body. Each match is ``ip:port``, optionally preceded by credentials
        ending in ``@``. An empty list is returned when the request fails
        or the server answers with an HTTP error status.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return []

    # Optional "user:password@" prefix, then four dot-separated groups of
    # 1-3 digits and a port. Octet ranges are not validated here.
    proxy_regex = r"(?:\b(?:[\S]+:)?(?:[\S]+)?@\b)?(?:\d{1,3}\.){3}\d{1,3}:\d+"
    return re.findall(proxy_regex, response.text)
|
||||||
|
|
||||||
|
|
||||||
|
def validate(proxy, type='http'):
    """Placeholder for proxy validation; no checks are performed yet.

    Both arguments are accepted but ignored, and the result is always
    ``None``.
    """
    # TODO: check regex (not implemented yet)
    return None
|
21
main.py
Normal file
21
main.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
"""Harvester: Proxy collection tool
|
||||||
|
Inspired by https://github.com/acidvegas/proxytools
|
||||||
|
"""
|
||||||
|
from harvester.proxy import fetch_list
|
||||||
|
|
||||||
|
|
||||||
|
# Proxy list sources fetched by main(), in this order.
URLS = [
    "https://api.openproxylist.xyz/socks4.txt",
    "https://api.openproxylist.xyz/socks5.txt",
    "https://api.proxyscrape.com/?request=displayproxies&proxytype=socks4",
]
|
||||||
|
|
||||||
|
def main():
    """Fetch every source in URLS and print the proxies found in each one."""
    for source in URLS:
        print(fetch_list(source))
|
||||||
|
|
||||||
|
|
||||||
|
# Run the harvester only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
1
requirements-dev.txt
Normal file
1
requirements-dev.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
pytest
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
requests
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
4
tests/data/proxies.txt
Normal file
4
tests/data/proxies.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
127.0.0.1:9000
|
||||||
|
garbage line lol
|
||||||
|
127.0.0.1:9001
|
||||||
|
username:pa$$@word@127.0.0.1:9002
|
32
tests/test_harvester.py
Normal file
32
tests/test_harvester.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import pytest
|
||||||
|
from harvester.proxy import fetch_list
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope='session', autouse=True)
def start_web_server():
    """Serve the ``data`` directory next to this file on port 8888.

    The server is started once per test session with the interpreter that
    is running the tests, and is shut down when the session ends, even if
    the session ends with an error.

    Raises:
        RuntimeError: If the server is not reachable within ten seconds.
    """
    import socket
    import sys
    from pathlib import Path

    # Resolve the data directory from this file so the tests do not depend
    # on the directory pytest was launched from.
    data_dir = Path(__file__).resolve().parent / 'data'
    server_process = subprocess.Popen(
        [sys.executable, '-m', 'http.server', '8888'],
        cwd=str(data_dir),
    )

    # Poll the port instead of sleeping a fixed second: faster when the
    # server starts quickly, and not racy when it starts slowly.
    deadline = time.monotonic() + 10
    while True:
        try:
            with socket.create_connection(('localhost', 8888), timeout=0.5):
                break
        except OSError:
            exited = server_process.poll() is not None
            if exited or time.monotonic() >= deadline:
                server_process.kill()
                server_process.wait()
                raise RuntimeError('test HTTP server did not start on port 8888')
            time.sleep(0.05)

    try:
        yield
    finally:
        server_process.terminate()
        # Reap the child so no zombie process is left behind.
        server_process.wait()
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_list():
    """The locally served proxies.txt yields its three proxies in file order."""
    url = 'http://localhost:8888/proxies.txt'
    assert fetch_list(url) == [
        '127.0.0.1:9000',
        '127.0.0.1:9001',
        'username:pa$$@word@127.0.0.1:9002',
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_list_fail():
    """A URL that cannot be fetched gives an empty list, not an exception."""
    # Assumes nothing is listening on port 12345 on the test machine.
    unreachable = 'http://localhost:12345/proxies.txt'
    assert fetch_list(unreachable) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_list_only_valid():
    """Lines that are not proxies are dropped from the harvested list."""
    result = fetch_list('http://localhost:8888/proxies.txt')

    # Guard against a vacuous pass: a failed fetch also returns an empty list.
    assert result
    assert 'garbage line lol' not in result
    assert not any('garbage' in proxy for proxy in result)
|
Loading…
Reference in New Issue
Block a user