feat: add manual scrape trigger endpoint
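
Example of hitting the new endpoint once the server is running (a minimal
sketch; the base URL and the "/sources" router prefix are assumptions, only
the "/{source_id}/scrape" suffix comes from this diff):

    import asyncio

    import httpx

    async def trigger(source_id: str) -> None:
        # Base URL and the "/sources" prefix are illustrative assumptions.
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            resp = await client.post(f"/sources/{source_id}/scrape")
            resp.raise_for_status()
            # Expected response shape: {"source_id": "...", "proxies_discovered": N}
            print(resp.json())

    asyncio.run(trigger("00000000-0000-0000-0000-000000000000"))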

agatha 2026-03-15 16:21:58 -04:00
parent 63b480ba9a
commit 51f6cfb4b4


@@ -1,10 +1,13 @@
from datetime import datetime
from uuid import UUID
import httpx
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from proxy_pool.common.dependencies import get_db, get_registry
from proxy_pool.config import get_settings
from proxy_pool.plugins.registry import PluginRegistry
from proxy_pool.proxy.models import Proxy, ProxySource
from proxy_pool.proxy.schemas import (
@@ -131,6 +134,81 @@ async def delete_source(
    await db.commit()


@router.post("/{source_id}/scrape")
async def trigger_scrape(
    source_id: UUID,
    db: AsyncSession = Depends(get_db),
    registry: PluginRegistry = Depends(get_registry),
):
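    """Fetch a source on demand, parse it, and upsert the proxies it yields."""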
    source = await db.get(ProxySource, source_id)
    if source is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Source not found",
        ) from None

    try:
        parser = registry.get_parser(source.parser_name)
    except Exception:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Parser '{source.parser_name}' not registered",
        ) from None
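
    # Fetch the raw listing using the configured scrape timeout and User-Agent.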
    settings = get_settings()
    try:
        async with httpx.AsyncClient(
            timeout=settings.proxy.scrape_timeout_seconds,
            headers={"User-Agent": settings.proxy.scrape_user_agent},
        ) as client:
            response = await client.get(str(source.url))
            response.raise_for_status()
    except httpx.HTTPError as err:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail=f"Failed to fetch source: {err}",
        ) from None
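
    # Hand the raw payload to the parser registered for this source.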
    discovered = await parser.parse(
        raw=response.content,
        source_url=str(source.url),
        source_id=source.id,
        default_protocol=source.default_protocol.value,
    )
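
    # Bulk-upsert the discovered proxies; rows that already exist for the same
    # (ip, port, protocol) just get their source_id refreshed.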
    if discovered:
        from sqlalchemy.dialects.postgresql import insert as pg_insert

        from proxy_pool.proxy.models import Proxy, ProxyStatus

        values = [
            {
                "ip": p.ip,
                "port": p.port,
                "protocol": p.protocol,
                "source_id": source.id,
                "status": ProxyStatus.UNCHECKED,
            }
            for p in discovered
        ]
        stmt = pg_insert(Proxy).values(values)
        stmt = stmt.on_conflict_do_update(
            index_elements=["ip", "port", "protocol"],
            set_={"source_id": stmt.excluded.source_id},
        )
        await db.execute(stmt)
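
    # Record when the source was last scraped and persist everything.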
    source.last_scraped_at = datetime.now()
    await db.commit()

    return {
        "source_id": str(source.id),
        "proxies_discovered": len(discovered),
    }


@proxy_router.get("", response_model=ProxyListResponse)
async def list_proxies(
    params: ProxyListParams = Depends(),