feat: add scrape tasks
This commit is contained in:
parent
b02793ae9b
commit
67089c570c
157
src/proxy_pool/worker/tasks_scrape.py
Normal file
157
src/proxy_pool/worker/tasks_scrape.py
Normal file
@ -0,0 +1,157 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from uuid import UUID
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker
|
||||
|
||||
from proxy_pool.plugins.protocols import Event
|
||||
from proxy_pool.plugins.registry import PluginRegistry
|
||||
from proxy_pool.proxy.models import Proxy, ProxySource, ProxyStatus
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def scrape_source(ctx: dict, source_id: str) -> dict:
    """Scrape a single proxy source and upsert discovered proxies.

    Args:
        ctx: Worker context holding ``session_factory``, the plugin
            ``registry`` and the application ``settings``.
        source_id: String UUID of the ``ProxySource`` row to scrape.

    Returns:
        A status dict: ``{"status": "skipped", "reason": ...}``,
        ``{"status": "error", "reason": ...}``, or
        ``{"status": "ok", "total": <upserted row count>}``.
    """
    session_factory: async_sessionmaker = ctx["session_factory"]
    registry: PluginRegistry = ctx["registry"]
    settings = ctx["settings"]

    async with session_factory() as db:
        source = await db.get(ProxySource, UUID(source_id))
        if source is None:
            logger.warning("Source %s not found, skipping", source_id)
            return {"status": "skipped", "reason": "not_found"}

        if not source.is_active:
            return {"status": "skipped", "reason": "inactive"}

        # Resolve the parser plugin; a missing parser is surfaced on the
        # event bus so operators can spot misconfigured sources.
        try:
            parser = registry.get_parser(source.parser_name)
        except Exception:  # registry's failure type is opaque from here
            logger.error(
                "No parser '%s' for source %s",
                source.parser_name,
                source_id,
            )
            await registry.emit(
                Event(
                    type="source.failed",
                    payload={
                        "source_id": source_id,
                        "error": f"Parser '{source.parser_name}' not found",
                    },
                )
            )
            return {"status": "error", "reason": "parser_not_found"}

        # Fetch the raw payload from the source URL.
        try:
            async with httpx.AsyncClient(
                timeout=settings.proxy.scrape_timeout_seconds,
                headers={"User-Agent": settings.proxy.scrape_user_agent},
            ) as client:
                response = await client.get(str(source.url))
                response.raise_for_status()
                raw = response.content
        except httpx.HTTPError as err:
            logger.warning(
                "Failed to fetch source %s: %s",
                source.url,
                err,
            )
            await registry.emit(
                Event(
                    type="source.failed",
                    payload={"source_id": source_id, "error": str(err)},
                )
            )
            return {"status": "error", "reason": str(err)}

        # Parse the payload into candidate proxies.
        discovered = await parser.parse(
            raw=raw,
            source_url=str(source.url),
            source_id=source.id,
            default_protocol=source.default_protocol.value,
        )

        if not discovered:
            logger.info("Source %s returned no proxies", source.url)
            return {"status": "ok", "new": 0, "total": 0}

        # Deduplicate on the conflict key before the upsert: PostgreSQL
        # rejects an INSERT ... ON CONFLICT DO UPDATE whose VALUES list
        # touches the same row twice ("command cannot affect row a second
        # time"), and scraped proxy lists commonly contain duplicates.
        # Last occurrence wins, matching ON CONFLICT semantics.
        unique: dict = {}
        for p in discovered:
            unique[(p.ip, p.port, p.protocol)] = {
                "ip": p.ip,
                "port": p.port,
                "protocol": p.protocol,
                "source_id": source.id,
                "status": ProxyStatus.UNCHECKED,
            }
        values = list(unique.values())

        stmt = insert(Proxy).values(values)
        stmt = stmt.on_conflict_do_update(
            index_elements=["ip", "port", "protocol"],
            set_={
                # Re-attribute an already-known proxy to this source.
                "source_id": stmt.excluded.source_id,
            },
        )
        await db.execute(stmt)

        # Record when this source was last scraped.
        # NOTE(review): naive local timestamp — if last_scraped_at is a
        # timezone-aware column, this should be datetime.now(timezone.utc);
        # confirm against the model definition.
        source.last_scraped_at = datetime.now()
        await db.commit()

        count = len(values)  # rows actually upserted (post-dedup)
        logger.info(
            "Scraped source %s: %d proxies discovered",
            source.url,
            count,
        )

        if count > 0:
            await registry.emit(
                Event(
                    type="proxy.new_batch",
                    payload={
                        "source_id": source_id,
                        "count": count,
                    },
                )
            )

        return {"status": "ok", "total": count}
|
||||
|
||||
|
||||
async def scrape_all(ctx: dict) -> dict:
    """Scrape every active proxy source, one after another.

    Returns:
        Summary dict with the number of sources processed and the total
        number of proxies reported across all of them.
    """
    session_factory: async_sessionmaker = ctx["session_factory"]

    # Collect the ids of all active sources up front; scrape_source opens
    # its own session per source, so this one is only needed for the query.
    async with session_factory() as db:
        rows = await db.execute(
            select(ProxySource).where(ProxySource.is_active.is_(True))
        )
        active_ids = [str(src.id) for src in rows.scalars().all()]

    # Scrape each source sequentially, tagging every outcome with its id.
    outcomes = [
        {"source_id": sid, **(await scrape_source(ctx, sid))}
        for sid in active_ids
    ]

    grand_total = sum(item.get("total", 0) for item in outcomes)
    logger.info(
        "Scrape sweep complete: %d sources, %d total proxies",
        len(outcomes),
        grand_total,
    )
    return {"sources": len(outcomes), "total_proxies": grand_total}
|
||||
Loading…
x
Reference in New Issue
Block a user