Initial commit
commit 9ec62c5d9b
.gitignore (vendored) · Normal file · 4 additions
@@ -0,0 +1,4 @@
.venv/
.idea/
__pycache__/
*.py[cod]
README.md · Normal file · 36 additions
@@ -0,0 +1,36 @@
# Pastebin Client

An exercise in Python package development and web scraping.

Because Pastebin does not offer an API unless you have a Pro account, this package scrapes
HTML for its data.

The `PastebinAPI.get_public_paste_list` method does not download full paste text to avoid
hammering the site. When the paste list is fetched, it will return a list of `Paste` objects
with the following fields:
- `title` - Paste title
- `href` - URL of the paste
- `lang` - Paste language
- `fetched` - True if the full text and metadata have been fetched

To fetch the full text and metadata of a paste, pass the paste as the argument to
`PastebinAPI.get_paste`. This will return a `Paste` object with the following fields populated:
- `author` - Paste author
- `pub_date` - Publication date
- `category` - Paste category
- `text` - Full paste text
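
A minimal sketch of this two-step workflow (the same flow that [src/main.py](src/main.py) walks through):

```python
from pastebin.client import PastebinAPI

api = PastebinAPI()

# Step 1: fetch the public archive listing (title, href, lang only)
pastes = api.get_public_paste_list()

# Step 2: fetch full text and metadata for a single paste
paste = api.get_paste(pastes[0])
print(paste.author, paste.pub_date, paste.category)
print(paste.text)
```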

This workflow will change once I figure out a decent method to create a full Pastebin Client
class that manages an internal paste list.

## Usage

```shell
git clone https://git.juggalol.com/agatha/pastebin-client
cd pastebin-client/src
pip install -r requirements.txt
python main.py
```

The example in [src/main.py](src/main.py) simply fetches the public paste list and
tests out the scraping functions.

## Notes
src/main.py · Normal file · 22 additions
@@ -0,0 +1,22 @@
from pastebin.client import PastebinAPI


def main():
    pastebin = PastebinAPI()

    # Fetch public paste list
    pastes = pastebin.get_public_paste_list()
    for paste in pastes:
        print(paste)

    # Fetch full data from a few pastes to test methods
    pastes[0] = pastebin.get_paste(pastes[0])
    pastes[1] = pastebin.get_paste(pastes[1])

    # Test filtering
    fetched = list(filter(lambda x: x.fetched is True, pastes))
    print(fetched)


if __name__ == '__main__':
    main()
src/pastebin/__init__.py · Normal file · 0 additions (empty file)
src/pastebin/client.py · Normal file · 144 additions
@@ -0,0 +1,144 @@
"""
This module provides a wrapper for interacting with Pastebin.

The module contains the `PastebinAPI` class which offers convenient methods to access various
endpoints of the API using HTTP requests. It supports custom headers and proxies for
additional flexibility.
"""
import logging
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup


logger = logging.getLogger()


class ApiRequestException(Exception):
    pass


class Paste:
    def __init__(self, title, href, lang):
        self.title = title
        self.href = href
        self.lang = lang

        self.fetched = False
        self.author = None
        self.pub_date = None
        self.category = None
        self.text = None

    def __repr__(self):
        return ("Paste("
                f"title='{self.title}', "
                f"href='{self.href}', "
                f"lang='{self.lang}', "
                f"fetched={self.fetched})"
                )


class PastebinAPI:
    """
    Wrapper class for interacting with Pastebin.

    This class provides convenient methods to access various endpoints of the
    API using HTTP requests. It supports custom headers and proxies for
    additional flexibility.

    Args:
        headers (dict, optional): Dictionary containing custom headers to be sent with each request.
        proxies (dict, optional): Dictionary containing proxy settings to be used for requests.
        debug (bool, optional): Setting this to True will print `urllib3` debug information.
    """
    def __init__(self, headers=None, proxies=None, debug=False):
        self.base_url = 'https://pastebin.com'
        self.session = requests.Session()

        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

        if headers:
            self.session.headers.update(headers)
        if proxies:
            self.session.proxies.update(proxies)

        if debug:
            if not logger.handlers:
                logging.basicConfig()

            logging.getLogger("urllib3").setLevel(logging.DEBUG)

    def __get(self, endpoint, params=None):
        url = self.base_url + endpoint

        try:
            response = self.session.get(url, params=params)
            response.raise_for_status()
        except requests.RequestException as e:
            raise ApiRequestException(f'Failed to execute GET request: {str(e)}')

        return response

    def get_public_paste_list(self):
        endpoint = '/archive'

        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(e)
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        pastes_table = soup.select('div[data-key]')

        pastes = [self._parse_paste(paste_html) for paste_html in pastes_table]

        return pastes

    def get_paste(self, paste):
        endpoint = f'/{paste.href}'

        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(e)
            return None

        # Get paste metadata
        soup = BeautifulSoup(response.text, 'html.parser')
        paste = self._parse_paste_metadata(soup, paste)

        # Get paste text
        paste = self._fetch_paste_text(paste)

        return paste

    def _parse_paste(self, paste_html):
        paste_title = paste_html.find('a').text
        paste_link = paste_html.find('a')['href'].lstrip('/')
        paste_lang = paste_html.find_all('a')[-1].text

        return Paste(href=paste_link, lang=paste_lang, title=paste_title)

    def _parse_paste_metadata(self, soup, paste):
        paste.author = soup.find('div', class_='username').a.text
        paste.pub_date = soup.find('div', class_='date').span.text
        paste.category = soup.find('span', title='Category').text.split('|')[1].strip()

        return paste

    def _fetch_paste_text(self, paste):
        endpoint = f'/raw/{paste.href}'
        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(f'Could not fetch paste text: {str(e)}')
        else:
            paste.text = response.text
            paste.fetched = True

        return paste
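
As the `PastebinAPI` docstring notes, custom headers, proxies, and `urllib3` debug logging can be supplied at construction time. A minimal sketch of that configuration (the header and proxy values here are placeholders, not part of this commit):

```python
from pastebin.client import PastebinAPI

# Placeholder header and proxy values, for illustration only
api = PastebinAPI(
    headers={'User-Agent': 'pastebin-client-example/0.1'},
    proxies={'https': 'http://127.0.0.1:8080'},
    debug=True,
)

pastes = api.get_public_paste_list()
print(pastes)
```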
src/requirements.txt · Normal file · 2 additions
@@ -0,0 +1,2 @@
requests
beautifulsoup4