From 9ec62c5d9bbd9a09757938ae74b497dae12a98a9 Mon Sep 17 00:00:00 2001 From: agatha Date: Tue, 12 Sep 2023 14:05:27 -0400 Subject: [PATCH] Initial commit --- .gitignore | 4 ++ README.md | 36 ++++++++++ src/main.py | 22 ++++++ src/pastebin/__init__.py | 0 src/pastebin/client.py | 144 +++++++++++++++++++++++++++++++++++++++ src/requirements.txt | 2 + 6 files changed, 208 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 src/main.py create mode 100644 src/pastebin/__init__.py create mode 100644 src/pastebin/client.py create mode 100644 src/requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f08bbdc --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.venv/ +.idea/ +__pycache__/ +*.py[cod] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..700faca --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# Pastebin Client +An exercise in Python package development and web scraping. + +Because Pastebin does not offer an API unless you have a Pro account, this package scrapes +HTML for its data. + +The `PastebinAPI.get_public_paste_list` method does not download full paste text to avoid +hammering the site. When the paste list is fetched, it will return a list of `Paste` objects +with the following fields: +- `title` - Paste title +- `href` - URL of the paste +- `lang` - Paste language +- `fetched` - True if the full text and metadata have been fetched + +To fetch the full text and metadata of a paste, pass the paste as the argument to +`PastebinAPI.get_paste`. This will return a `Paste` object with the following fields populated: +- `author` - Paste author +- `pub_date` - Publication date +- `category` - Paste category +- `text` - Full paste text + +This workflow will change once I figure out a decent method to create a full Pastebin Client +class that manages an internal paste list. 
+
+## Usage
+```shell
+git clone https://git.juggalol.com/agatha/pastebin-client
+cd pastebin-client/src
+pip install -r requirements.txt
+python main.py
+```
+
+Example in [src/main.py](src/main.py) simply fetches the public paste list and
+tests out the scraping functions.
+
+## Notes
diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..f5231d4
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,22 @@
+from pastebin.client import PastebinAPI
+
+
+def main():
+    pastebin = PastebinAPI()
+
+    # Fetch public paste list; it is None when the archive request failed
+    pastes = pastebin.get_public_paste_list() or []
+    for paste in pastes:
+        print(paste)
+
+    # Fetch full data from a few pastes; keep the original on fetch failure
+    for i in range(min(2, len(pastes))):
+        pastes[i] = pastebin.get_paste(pastes[i]) or pastes[i]
+
+    # Test filtering
+    fetched = list(filter(lambda x: x.fetched is True, pastes))
+    print(fetched)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/pastebin/__init__.py b/src/pastebin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pastebin/client.py b/src/pastebin/client.py
new file mode 100644
index 0000000..fcd00eb
--- /dev/null
+++ b/src/pastebin/client.py
@@ -0,0 +1,144 @@
+"""
+This module provides a wrapper for interacting with Pastebin.
+
+The module contains the `PastebinAPI` class which offers convenient methods to access various
+endpoints of the API using HTTP requests. It supports custom headers and proxies for
+additional flexibility.
+""" +import logging +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry +from bs4 import BeautifulSoup + + +logger = logging.getLogger() + + +class ApiRequestException(Exception): + pass + + +class Paste: + def __init__(self, title, href, lang): + self.title = title + self.href = href + self.lang = lang + + self.fetched = False + self.author = None + self.pub_date = None + self.category = None + self.text = None + + def __repr__(self): + return ("Paste(" + f"title='{self.title}', " + f"href='{self.href}', " + f"lang='{self.lang}', " + f"fetched={self.fetched})" + ) + + +class PastebinAPI: + """ + Wrapper class for interacting with Pastebin. + + This class provides convenient methods to access various endpoints of the + API using HTTP requests. It supports custom headers and proxies for + additional flexibility. + + Args: + headers (dict, optional): Dictionary containing customer headers to be sent with each request. + proxies (dict, optional): Dictionary containing proxy settings to be used for requests. + debug (bool, optional): Setting this to true will print `urllib3` debug information. 
+ """ + def __init__(self, headers=None, proxies=None, debug=False): + self.base_url = 'https://pastebin.com' + self.session = requests.Session() + + retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + self.session.mount('http://', HTTPAdapter(max_retries=retries)) + self.session.mount('https://', HTTPAdapter(max_retries=retries)) + + if headers: + self.session.headers.update(headers) + if proxies: + self.session.proxies.update(proxies) + + if debug: + if not logger.handlers: + logging.basicConfig() + + logging.getLogger("urllib3").setLevel(logging.DEBUG) + + def __get(self, endpoint, params=None): + url = self.base_url + endpoint + + try: + response = self.session.get(url, params=params) + response.raise_for_status() + except requests.RequestException as e: + raise ApiRequestException(f'Failed to execute GET request: {str(e)}') + + return response + + def get_public_paste_list(self): + endpoint = '/archive' + + try: + response = self.__get(endpoint) + except ApiRequestException as e: + logger.warning(e) + return None + + soup = BeautifulSoup(response.text, 'html.parser') + pastes_table = soup.select('div[data-key]') + + pastes = [self._parse_paste(paste_html) for paste_html in pastes_table] + + return pastes + + def get_paste(self, paste): + endpoint = f'/{paste.href}' + + try: + response = self.__get(endpoint) + except ApiRequestException as e: + logger.warning(e) + return None + + # Get paste metadata + soup = BeautifulSoup(response.text, 'html.parser') + paste = self._parse_paste_metadata(soup, paste) + + # Get paste text + paste = self._fetch_paste_text(paste) + + return paste + + def _parse_paste(self, paste_html): + paste_title = paste_html.find('a').text + paste_link = paste_html.find('a')['href'].lstrip('/') + paste_lang = paste_html.find_all('a')[-1].text + + return Paste(href=paste_link, lang=paste_lang, title=paste_title) + + def _parse_paste_metadata(self, soup, paste): + paste.author = soup.find('div', 
class_='username').a.text + paste.pub_date = soup.find('div', class_='date').span.text + paste.category = soup.find('span', title='Category').text.split('|')[1].strip() + + return paste + + def _fetch_paste_text(self, paste): + endpoint = f'/raw/{paste.href}' + try: + response = self.__get(endpoint) + except ApiRequestException as e: + logger.warning(f'Could not fetch paste text: {str(e)}') + else: + paste.text = response.text + paste.fetched = True + + return paste diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..a98ae43 --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,2 @@ +requests +beautifulsoup4 \ No newline at end of file