add validate_proxies stub

chore: refactor proxy gathering into dedicated function
feat: add alembic for database migrations
2024-09-22 15:31:20 -04:00 · 2024-09-22 15:25:13 -04:00 · 2024-09-22 15:08:55 -04:00 · 2024-09-22 15:02:22 -04:00 · 2024-09-22 14:15:01 -04:00 · 2024-09-22 14:13:06 -04:00
13 changed files with 357 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,3 +6,6 @@ __pycache__/

 # proxies dev results
 proxies/
+
+# sqlite database
+*.db
--- a/.pylintrc
+++ b/.pylintrc
@ -0,0 +1,2 @@
+[FORMAT]
+max-line-length = 120
--- a/alembic.ini
+++ b/alembic.ini
@ -0,0 +1,116 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+# Use forward slashes (/) also on windows to provide an os agnostic path
+script_location = migrations
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python>=3.9 or backports.zoneinfo library.
+# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to ZoneInfo()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to migrations/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:migrations/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os  # Use os.pathsep. Default configuration used for new projects.
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = sqlite:///proxies.db
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts.  See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
+# hooks = ruff
+# ruff.type = exec
+# ruff.executable = %(here)s/.venv/bin/ruff
+# ruff.options = --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
--- a/harvester/db/init.py
+++ b/harvester/db/init.py
--- a/harvester/db/models.py
+++ b/harvester/db/models.py
@ -0,0 +1,21 @@
+from sqlalchemy import Column, Integer, String, DateTime
+from .schema import Base
+
+
+class Proxy(Base):
+    """A harvested proxy, stored as one row of the ``proxies`` table."""
+
+    __tablename__ = 'proxies'
+
+    id = Column(Integer, primary_key=True)
+    # host/port are the two halves of the 'host:port' proxy string (split in main.py).
+    host = Column(String, nullable=False)
+    port = Column(Integer, nullable=False)
+    asn = Column(String)
+    # Filled from the validation result in main.py; presumably the proxy's outbound IP.
+    egress_ip = Column(String)
+    egress_asn = Column(String)
+    # date_added is set on insert; date_validated is refreshed each time the row is saved.
+    date_added = Column(DateTime, nullable=False)
+    date_validated = Column(DateTime)
+
+    def __repr__(self):
+        """Return a short debugging representation of this row."""
+        # `date_added` is the column defined above; the model has no
+        # `created_at` attribute, so reading one would raise AttributeError.
+        return (
+            f'Proxy(id={self.id}, host={self.host}, port={self.port}, '
+            f'egress_ip={self.egress_ip}, date_added={self.date_added})'
+        )
--- a/harvester/db/schema.py
+++ b/harvester/db/schema.py
@ -0,0 +1,10 @@
+from sqlalchemy import MetaData, create_engine
+from sqlalchemy.ext.declarative import declarative_base
+
+# Shared MetaData registry that the declarative Base below is bound to.
+metadata = MetaData()
+# Declarative base class that the ORM models inherit from.
+Base = declarative_base(metadata=metadata)
+
+
+def init_db(engine_url):
+    """Run ``create_all`` for the tables registered on ``Base.metadata``.
+
+    Args:
+        engine_url: SQLAlchemy database URL passed to ``create_engine``.
+    """
+    Base.metadata.create_all(create_engine(engine_url))
--- a/harvester/db/session.py
+++ b/harvester/db/session.py
@ -0,0 +1,26 @@
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+
+class SessionFactory:
+    """Owns one SQLAlchemy engine and hands out sessions bound to it."""
+
+    def __init__(self, engine_url):
+        """Build the engine for ``engine_url`` and a sessionmaker bound to it."""
+        engine = create_engine(engine_url)
+        self.engine = engine
+        self.Session = sessionmaker(bind=engine)
+
+    def create_session(self):
+        """Return a new session produced by the bound sessionmaker."""
+        new_session = self.Session()
+        return new_session
+
+
+# Module-level singleton SessionFactory; stays None until
+# get_session_factory() creates it on first use.
+session_factory = None
+
+
+def get_session_factory(engine_url):
+    """Return the shared SessionFactory, creating it on the first call.
+
+    Args:
+        engine_url: SQLAlchemy database URL. Only used when no factory exists
+            yet; once one has been created this argument is ignored.
+
+    Returns:
+        The module-level ``SessionFactory`` instance.
+    """
+    global session_factory
+    if session_factory is not None:
+        return session_factory
+    session_factory = SessionFactory(engine_url)
+    return session_factory
+
+
+def get_session(engine_url):
+    """Open a new session from the shared SessionFactory.
+
+    Args:
+        engine_url: SQLAlchemy database URL forwarded to ``get_session_factory``.
+
+    Returns:
+        The session returned by the factory's ``create_session``.
+    """
+    factory = get_session_factory(engine_url)
+    return factory.create_session()
--- a/main.py
+++ b/main.py
@ -5,8 +5,17 @@ import time
 import concurrent.futures
 import logging
 import os
+from datetime import datetime
+from harvester.db.models import Proxy
+from harvester.db.schema import init_db
+from harvester.db.session import SessionFactory
 from harvester.proxy import fetch_all, validate_socks

+# Database location; the DATABASE_URL environment variable overrides the default.
+DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///proxies.db')
+
+# NOTE: both calls below run at import time, so importing this module runs
+# create_all() against DATABASE_URL and builds an engine for it.
+init_db(DATABASE_URL)
+session_factory = SessionFactory(DATABASE_URL)
+

 def read_file(path):
    with open(path, 'r', encoding='utf-8') as file:
@ -20,10 +29,11 @@ def write_file(path, data):
        file.write(data)


-def main():
-    """Main entry point."""
-    logging.basicConfig(level=logging.WARN)
+def validate_proxies():
+    """Placeholder for proxy validation; currently does nothing."""

+
+def gather_proxies():
    # Load proxy source list and fetch proxies
    urls = read_file('data/proxy-sources.txt')
    proxies = fetch_all(urls)
@ -48,6 +58,26 @@ def main():
            valid.append(proxy)
            print(f'{proxy} -> {ip}')

+            # Save to DB
+            session = session_factory.create_session()
+            existing_proxy = session.query(Proxy).filter_by(host=proxy.split(':')[0],
+                                                            port=int(proxy.split(':')[1])).first()
+
+            if existing_proxy:
+                existing_proxy.date_validated = datetime.now()
+                existing_proxy.egress_ip = ip
+            else:
+                new_proxy = Proxy(
+                    host=proxy.split(':')[0],
+                    port=int(proxy.split(':')[1]),
+                    egress_ip=ip,
+                    date_added=datetime.now(),
+                    date_validated=datetime.now(),
+                )
+                session.add(new_proxy)
+
+            session.commit()
+
    # Create output directory if it does not exist
    if not os.path.exists('proxies'):
        os.makedirs('proxies')
@ -66,5 +96,11 @@ def main():
    )


+def main():
+    """Configure logging at WARNING level, then run the proxy gathering step."""
+    logging.basicConfig(level=logging.WARNING)
+    gather_proxies()
+
+
 if __name__ == '__main__':
    main()
--- a/migrations/README
+++ b/migrations/README
@ -0,0 +1 @@
+Generic single-database configuration.
--- a/migrations/env.py
+++ b/migrations/env.py
@ -0,0 +1,80 @@
+from logging.config import fileConfig
+
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+
+from alembic import context
+
+from harvester.db.models import Base
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Configure Python logging from the .ini file (its [loggers], [handlers]
+# and [formatters] sections), but only when a config file is in use.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# MetaData of the project's declarative Base, handed to Alembic
+# for 'autogenerate' support.
+target_metadata = Base.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    The context is configured with only the URL read from the
+    ``sqlalchemy.url`` option, so no Engine is created here. Calls to
+    context.execute() emit the given string to the script output.
+    """
+    context.configure(
+        url=config.get_main_option("sqlalchemy.url"),
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+    )
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    Builds an Engine from the ``sqlalchemy.``-prefixed options of the section
+    named by ``config.config_ini_section`` and runs the migrations over a
+    connection obtained from it.
+    """
+    ini_section = config.get_section(config.config_ini_section, {})
+    connectable = engine_from_config(
+        ini_section,
+        prefix="sqlalchemy.",
+        poolclass=pool.NullPool,
+    )
+
+    with connectable.connect() as connection:
+        context.configure(connection=connection, target_metadata=target_metadata)
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+# Module-level dispatch: run whichever mode the Alembic context reports.
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
--- a/migrations/script.py.mako
+++ b/migrations/script.py.mako
@ -0,0 +1,26 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    ${downgrades if downgrades else "pass"}
--- a/migrations/versions/2ef68467c443_initial_migration.py
+++ b/migrations/versions/2ef68467c443_initial_migration.py
@ -0,0 +1,30 @@
+"""Initial migration
+
+Revision ID: 2ef68467c443
+Revises: 
+Create Date: 2024-09-22 15:06:36.590091
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '2ef68467c443'
+# None: this revision has no parent (the header's "Revises:" field is empty).
+down_revision: Union[str, None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Apply this revision; currently a no-op."""
+    # NOTE(review): no tables are created here, so the `proxies` table presumably
+    # exists only via init_db()'s create_all() — confirm this is intended.
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Revert this revision; currently a no-op."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,4 @@
 requests
 requests[socks]
+sqlalchemy
+alembic
Author	SHA1	Message	Date
agatha	eb406281d2	add validate_proxies stub	2024-09-22 15:31:20 -04:00
agatha	117d88b92d	chore: refactor proxy gathering into dedicated function	2024-09-22 15:25:13 -04:00
agatha	ed7757ab8f	feat: add alembic for database migrations	2024-09-22 15:08:55 -04:00
agatha	7b55220589	feat: proxies are updated if already in database	2024-09-22 15:02:22 -04:00
agatha	5ac63e1fb5	feat: write proxies to database	2024-09-22 14:15:01 -04:00
agatha	d634e51cdf	fix __repr__	2024-09-22 14:13:06 -04:00
agatha	e124aea332	update .gitignore to ignore sqlite db	2024-09-22 14:11:50 -04:00
agatha	336cd61a8c	add init_db	2024-09-22 14:11:17 -04:00
agatha	d710dc723e	add database models and session factory	2024-09-22 13:34:50 -04:00
agatha	59a11c97df	chore: add .pylintrc	2024-09-22 13:34:01 -04:00