Initial commit

agatha 2023-09-12 14:05:27 -04:00
commit 9ec62c5d9b
6 changed files with 208 additions and 0 deletions

4
.gitignore vendored Normal file

@@ -0,0 +1,4 @@
.venv/
.idea/
__pycache__/
*.py[cod]

36
README.md Normal file

@@ -0,0 +1,36 @@
# Pastebin Client
An exercise in Python package development and web scraping.

Because Pastebin does not offer an API unless you have a Pro account, this package scrapes
HTML for its data.

The `PastebinAPI.get_public_paste_list` method does not download the full text of each paste,
to avoid hammering the site. It returns a list of `Paste` objects with the following fields:
- `title` - Paste title
- `href` - URL of the paste
- `lang` - Paste language
- `fetched` - True if the full text and metadata have been fetched

To fetch the full text and metadata of a paste, pass it to `PastebinAPI.get_paste`. This
returns the `Paste` object with the following additional fields populated:
- `author` - Paste author
- `pub_date` - Publication date
- `category` - Paste category
- `text` - Full paste text

This workflow will change once I figure out a decent way to create a full Pastebin client
class that manages an internal paste list. A rough sketch of the current two-step workflow
is shown below.
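
A minimal sketch of the fetch-then-hydrate workflow (error handling omitted; the fields are
the ones listed above):

```python
from pastebin.client import PastebinAPI

pastebin = PastebinAPI()

# Step 1: list public pastes; only title, href, and lang are populated
pastes = pastebin.get_public_paste_list()

# Step 2: fetch full text and metadata for a single paste
paste = pastebin.get_paste(pastes[0])
print(paste.author, paste.pub_date, paste.category)
print(paste.text)
```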
## Usage
```shell
git clone https://git.juggalol.com/agatha/pastebin-client
cd pastebin-client/src
pip install -r requirements.txt
python main.py
```

The example in [src/main.py](src/main.py) simply fetches the public paste list and tests out
the scraping functions.
## Notes

22
src/main.py Normal file

@@ -0,0 +1,22 @@
from pastebin.client import PastebinAPI


def main():
    pastebin = PastebinAPI()

    # Fetch public paste list
    pastes = pastebin.get_public_paste_list()
    for paste in pastes:
        print(paste)

    # Fetch full data from a few pastes to test methods
    pastes[0] = pastebin.get_paste(pastes[0])
    pastes[1] = pastebin.get_paste(pastes[1])

    # Test filtering
    fetched = list(filter(lambda x: x.fetched is True, pastes))
    print(fetched)


if __name__ == '__main__':
    main()

0
src/pastebin/__init__.py Normal file

144
src/pastebin/client.py Normal file

@@ -0,0 +1,144 @@
"""
This module provides a wrapper for interacting with Pastebin.
The module contains the `PastebinAPI` class which offers convenient methods to access various
endpoints of the API using HTTP requests. It supports custom headers and proxies for
additional flexibility.
"""
import logging
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
logger = logging.getLogger()
class ApiRequestException(Exception):
pass
class Paste:
def __init__(self, title, href, lang):
self.title = title
self.href = href
self.lang = lang
self.fetched = False
self.author = None
self.pub_date = None
self.category = None
self.text = None
def __repr__(self):
return ("Paste("
f"title='{self.title}', "
f"href='{self.href}', "
f"lang='{self.lang}', "
f"fetched={self.fetched})"
)
class PastebinAPI:
"""
Wrapper class for interacting with Pastebin.
This class provides convenient methods to access various endpoints of the
API using HTTP requests. It supports custom headers and proxies for
additional flexibility.
Args:
headers (dict, optional): Dictionary containing customer headers to be sent with each request.
proxies (dict, optional): Dictionary containing proxy settings to be used for requests.
debug (bool, optional): Setting this to true will print `urllib3` debug information.
"""
def __init__(self, headers=None, proxies=None, debug=False):
self.base_url = 'https://pastebin.com'
self.session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
self.session.mount('http://', HTTPAdapter(max_retries=retries))
self.session.mount('https://', HTTPAdapter(max_retries=retries))
if headers:
self.session.headers.update(headers)
if proxies:
self.session.proxies.update(proxies)
if debug:
if not logger.handlers:
logging.basicConfig()
logging.getLogger("urllib3").setLevel(logging.DEBUG)
def __get(self, endpoint, params=None):
url = self.base_url + endpoint
try:
response = self.session.get(url, params=params)
response.raise_for_status()
except requests.RequestException as e:
raise ApiRequestException(f'Failed to execute GET request: {str(e)}')
return response
def get_public_paste_list(self):
endpoint = '/archive'
try:
response = self.__get(endpoint)
except ApiRequestException as e:
logger.warning(e)
return None
soup = BeautifulSoup(response.text, 'html.parser')
pastes_table = soup.select('div[data-key]')
pastes = [self._parse_paste(paste_html) for paste_html in pastes_table]
return pastes
def get_paste(self, paste):
endpoint = f'/{paste.href}'
try:
response = self.__get(endpoint)
except ApiRequestException as e:
logger.warning(e)
return None
# Get paste metadata
soup = BeautifulSoup(response.text, 'html.parser')
paste = self._parse_paste_metadata(soup, paste)
# Get paste text
paste = self._fetch_paste_text(paste)
return paste
def _parse_paste(self, paste_html):
paste_title = paste_html.find('a').text
paste_link = paste_html.find('a')['href'].lstrip('/')
paste_lang = paste_html.find_all('a')[-1].text
return Paste(href=paste_link, lang=paste_lang, title=paste_title)
def _parse_paste_metadata(self, soup, paste):
paste.author = soup.find('div', class_='username').a.text
paste.pub_date = soup.find('div', class_='date').span.text
paste.category = soup.find('span', title='Category').text.split('|')[1].strip()
return paste
def _fetch_paste_text(self, paste):
endpoint = f'/raw/{paste.href}'
try:
response = self.__get(endpoint)
except ApiRequestException as e:
logger.warning(f'Could not fetch paste text: {str(e)}')
else:
paste.text = response.text
paste.fetched = True
return paste
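
The constructor's optional `headers` and `proxies` arguments pass straight through to the
underlying `requests` session. A minimal sketch of a customized client follows; the header
and proxy values are illustrative placeholders, not project defaults:

```python
from pastebin.client import PastebinAPI

# All values below are illustrative placeholders
api = PastebinAPI(
    headers={'User-Agent': 'my-scraper/0.1'},    # merged into session headers
    proxies={'https': 'http://127.0.0.1:8080'},  # standard requests proxy mapping
    debug=True,                                  # enable urllib3 DEBUG logging
)
```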

2
src/requirements.txt Normal file

@@ -0,0 +1,2 @@
requests
beautifulsoup4