Initial commit
This commit is contained in:
commit
9ec62c5d9b
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
.venv/
|
||||
.idea/
|
||||
__pycache__/
|
||||
*.py[cod]
|
36
README.md
Normal file
36
README.md
Normal file
@ -0,0 +1,36 @@
|
||||
# Pastebin Client
|
||||
An exercise in Python package development and web scraping.
|
||||
|
||||
Because Pastebin does not offer an API unless you have a Pro account, this package scrapes
|
||||
HTML for its data.
|
||||
|
||||
The `PastebinAPI.get_public_paste_list` method does not download full paste text to avoid
|
||||
hammering the site. When the paste list is fetched, it will return a list of `Paste` objects
|
||||
with the following fields:
|
||||
- `title` - Paste title
|
||||
- `href` - URL of the paste
|
||||
- `lang` - Paste language
|
||||
- `fetched` - True if the full text and metadata have been fetched
|
||||
|
||||
To fetch the full text and metadata of a paste, pass the paste as the argument to
|
||||
`PastebinAPI.get_paste`. This will return a `Paste` object with the following fields populated:
|
||||
- `author` - Paste author
|
||||
- `pub_date` - Publication date
|
||||
- `category` - Paste category
|
||||
- `text` - Full paste text
|
||||
|
||||
This workflow will change once I figure out a decent method to create a full Pastebin Client
|
||||
class that manages an internal paste list.
|
||||
|
||||
## Usage
|
||||
```shell
|
||||
git clone https://git.juggalol.com/agatha/pastebin-client
|
||||
cd pastebin-client/src
|
||||
pip install -r requirements.txt
|
||||
python main.py
|
||||
```
|
||||
|
||||
Example in [src/main.py](src/main.py) simply fetches the public paste list and
|
||||
tests out the scraping functions.
|
||||
|
||||
## Notes
|
22
src/main.py
Normal file
22
src/main.py
Normal file
@ -0,0 +1,22 @@
|
||||
from pastebin.client import PastebinAPI
|
||||
|
||||
|
||||
def main():
    """Demonstrate the PastebinAPI: list public pastes, fetch a couple in full, filter."""
    pastebin = PastebinAPI()

    # Fetch public paste list
    pastes = pastebin.get_public_paste_list()
    if not pastes:
        # get_public_paste_list returns None when the request fails.
        print('Could not fetch public paste list.')
        return

    for paste in pastes:
        print(paste)

    # Fetch full data from a few pastes to test methods.
    # Guard against short lists and against get_paste returning None on failure.
    for i in range(min(2, len(pastes))):
        full_paste = pastebin.get_paste(pastes[i])
        if full_paste is not None:
            pastes[i] = full_paste

    # Test filtering
    fetched = [p for p in pastes if p.fetched]
    print(fetched)


if __name__ == '__main__':
    main()
|
0
src/pastebin/__init__.py
Normal file
0
src/pastebin/__init__.py
Normal file
144
src/pastebin/client.py
Normal file
144
src/pastebin/client.py
Normal file
@ -0,0 +1,144 @@
|
||||
"""
|
||||
This module provides a wrapper for interacting with Pastebin.
|
||||
|
||||
The module contains the `PastebinAPI` class which offers convenient methods to access various
|
||||
endpoints of the API using HTTP requests. It supports custom headers and proxies for
|
||||
additional flexibility.
|
||||
"""
|
||||
import logging
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.packages.urllib3.util.retry import Retry
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Root logger (no name): PastebinAPI's `debug` option checks `logger.handlers`
# to decide whether logging has already been configured process-wide.
logger = logging.getLogger()
|
||||
|
||||
|
||||
class ApiRequestException(Exception):
    """Raised when a GET request to Pastebin fails (network error or HTTP error status)."""
    pass
|
||||
|
||||
|
||||
class Paste:
    """
    Container for a single Pastebin paste.

    Created with only the archive-listing fields (`title`, `href`, `lang`);
    the remaining metadata is filled in by `PastebinAPI.get_paste`, which
    flips `fetched` to True once the raw text has been downloaded.
    """

    def __init__(self, title, href, lang):
        self.title = title
        self.href = href
        self.lang = lang

        # Populated later by PastebinAPI.get_paste.
        self.fetched = False
        self.author = None
        self.pub_date = None
        self.category = None
        self.text = None

    def __repr__(self):
        details = ", ".join([
            f"title='{self.title}'",
            f"href='{self.href}'",
            f"lang='{self.lang}'",
            f"fetched={self.fetched}",
        ])
        return f"Paste({details})"
|
||||
|
||||
|
||||
class PastebinAPI:
    """
    Wrapper class for interacting with Pastebin.

    This class provides convenient methods to scrape various pages of the
    site using HTTP requests. It supports custom headers and proxies for
    additional flexibility.

    Args:
        headers (dict, optional): Dictionary containing custom headers to be sent with each request.
        proxies (dict, optional): Dictionary containing proxy settings to be used for requests.
        debug (bool, optional): Setting this to true will print `urllib3` debug information.
        timeout (float, optional): Per-request timeout in seconds (default 10).
    """
    def __init__(self, headers=None, proxies=None, debug=False, timeout=10):
        self.base_url = 'https://pastebin.com'
        self.timeout = timeout
        self.session = requests.Session()

        # Retry transient failures (rate limiting / server errors) with backoff.
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

        if headers:
            self.session.headers.update(headers)
        if proxies:
            self.session.proxies.update(proxies)

        if debug:
            # Only install a default handler if logging is not already configured.
            if not logger.handlers:
                logging.basicConfig()

            logging.getLogger("urllib3").setLevel(logging.DEBUG)

    def __get(self, endpoint, params=None):
        """
        Execute a GET request against the given endpoint.

        Args:
            endpoint (str): Path (with leading '/') appended to the base URL.
            params (dict, optional): Query string parameters.

        Returns:
            requests.Response: The successful response.

        Raises:
            ApiRequestException: If the request fails or returns an error status.
        """
        url = self.base_url + endpoint

        try:
            # A timeout prevents hanging indefinitely on an unresponsive server.
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            # Chain the original exception so the root cause is preserved.
            raise ApiRequestException(f'Failed to execute GET request: {str(e)}') from e

        return response

    def get_public_paste_list(self):
        """
        Scrape the public paste archive.

        Returns:
            list | None: Unfetched `Paste` objects, or None if the request failed.
        """
        endpoint = '/archive'

        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(e)
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        pastes_table = soup.select('div[data-key]')

        return [self._parse_paste(paste_html) for paste_html in pastes_table]

    def get_paste(self, paste):
        """
        Fetch full text and metadata for a paste.

        Args:
            paste (Paste): A paste as returned by `get_public_paste_list`.

        Returns:
            Paste | None: The paste with metadata and text populated, or None
            if the paste page request failed.
        """
        endpoint = f'/{paste.href}'

        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(e)
            return None

        # Get paste metadata
        soup = BeautifulSoup(response.text, 'html.parser')
        paste = self._parse_paste_metadata(soup, paste)

        # Get paste text
        paste = self._fetch_paste_text(paste)

        return paste

    def _parse_paste(self, paste_html):
        """Build an unfetched `Paste` from one archive-table row (`div[data-key]`)."""
        # One find_all instead of repeated find('a') calls: first anchor is the
        # title link, last anchor is the language.
        anchors = paste_html.find_all('a')
        paste_title = anchors[0].text
        paste_link = anchors[0]['href'].lstrip('/')
        paste_lang = anchors[-1].text

        return Paste(href=paste_link, lang=paste_lang, title=paste_title)

    def _parse_paste_metadata(self, soup, paste):
        """
        Populate author, pub_date and category from a paste page.

        Missing elements are tolerated: the corresponding field is simply left
        as None instead of raising AttributeError/IndexError.
        """
        username = soup.find('div', class_='username')
        if username and username.a:
            paste.author = username.a.text

        date = soup.find('div', class_='date')
        if date and date.span:
            paste.pub_date = date.span.text

        category = soup.find('span', title='Category')
        if category:
            # The span text has the form "Category | <name>".
            parts = category.text.split('|')
            if len(parts) > 1:
                paste.category = parts[1].strip()

        return paste

    def _fetch_paste_text(self, paste):
        """Download the raw paste text; sets `fetched` to True only on success."""
        endpoint = f'/raw/{paste.href}'
        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(f'Could not fetch paste text: {str(e)}')
        else:
            paste.text = response.text
            paste.fetched = True

        return paste
|
2
src/requirements.txt
Normal file
2
src/requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
requests
|
||||
beautifulsoup4
|
Loading…
Reference in New Issue
Block a user