Initial commit
This commit is contained in:
commit
9ec62c5d9b
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
.venv/
|
||||
.idea/
|
||||
__pycache__/
|
||||
*.py[cod]
|
36
README.md
Normal file
36
README.md
Normal file
@ -0,0 +1,36 @@
|
||||
# Pastebin Client
|
||||
An exercise in Python package development and web scraping.
|
||||
|
||||
Because Pastebin does not offer an API unless you have a Pro account, this package scrapes
|
||||
HTML for its data.
|
||||
|
||||
The `PastebinAPI.get_public_paste_list` method does not download full paste text to avoid
|
||||
hammering the site. When the paste list is fetched, it will return a list of `Paste` objects
|
||||
with the following fields:
|
||||
- `title` - Paste title
|
||||
- `href` - URL of the paste
|
||||
- `lang` - Paste language
|
||||
- `fetched` - True if the full text and metadata have been fetched
|
||||
|
||||
To fetch the full text and metadata of a paste, pass the paste as the argument to
|
||||
`PastebinAPI.get_paste`. This will return a `Paste` object with the following fields populated:
|
||||
- `author` - Paste author
|
||||
- `pub_date` - Publication date
|
||||
- `category` - Paste category
|
||||
- `text` - Full paste text
|
||||
|
||||
This workflow will change once I figure out a decent method to create a full Pastebin Client
|
||||
class that manages an internal paste list.
|
||||
|
||||
## Usage
|
||||
```shell
|
||||
git clone https://git.juggalol.com/agatha/pastebin-client
|
||||
cd pastebin-client/src
|
||||
pip install -r requirements.txt
|
||||
python main.py
|
||||
```
|
||||
|
||||
Example in [src/main.py](src/main.py) simply fetches the public paste list and
|
||||
tests out the scraping functions.
|
||||
|
||||
## Notes
|
22
src/main.py
Normal file
22
src/main.py
Normal file
@ -0,0 +1,22 @@
|
||||
from pastebin.client import PastebinAPI
|
||||
|
||||
|
||||
def main():
    """Demonstrate the PastebinAPI: list public pastes, fetch a couple in full, filter."""
    pastebin = PastebinAPI()

    # Fetch public paste list
    pastes = pastebin.get_public_paste_list()
    if not pastes:
        # get_public_paste_list returns None when the request fails.
        print('Could not fetch public paste list.')
        return

    for paste in pastes:
        print(paste)

    # Fetch full data from a few pastes to test methods.
    # Guard against short lists and against get_paste returning None on failure.
    for i in range(min(2, len(pastes))):
        full_paste = pastebin.get_paste(pastes[i])
        if full_paste is not None:
            pastes[i] = full_paste

    # Test filtering
    fetched = [p for p in pastes if p.fetched]
    print(fetched)


if __name__ == '__main__':
    main()
|
0
src/pastebin/__init__.py
Normal file
0
src/pastebin/__init__.py
Normal file
144
src/pastebin/client.py
Normal file
144
src/pastebin/client.py
Normal file
@ -0,0 +1,144 @@
|
||||
"""
|
||||
This module provides a wrapper for interacting with Pastebin.
|
||||
|
||||
The module contains the `PastebinAPI` class which offers convenient methods to access various
|
||||
endpoints of the API using HTTP requests. It supports custom headers and proxies for
|
||||
additional flexibility.
|
||||
"""
|
||||
import logging
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.packages.urllib3.util.retry import Retry
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Root logger (no name): PastebinAPI's `debug` option checks `logger.handlers`
# to decide whether logging has already been configured process-wide.
logger = logging.getLogger()
|
||||
|
||||
|
||||
class ApiRequestException(Exception):
    """Raised when a GET request to Pastebin fails (network error or HTTP error status)."""
    pass
|
||||
|
||||
|
||||
class Paste:
    """
    Container for a single Pastebin paste.

    Created with only the archive-listing fields (`title`, `href`, `lang`);
    the remaining metadata is filled in by `PastebinAPI.get_paste`, which
    flips `fetched` to True once the raw text has been downloaded.
    """

    def __init__(self, title, href, lang):
        self.title = title
        self.href = href
        self.lang = lang

        # Populated later by PastebinAPI.get_paste.
        self.fetched = False
        self.author = None
        self.pub_date = None
        self.category = None
        self.text = None

    def __repr__(self):
        details = ", ".join([
            f"title='{self.title}'",
            f"href='{self.href}'",
            f"lang='{self.lang}'",
            f"fetched={self.fetched}",
        ])
        return f"Paste({details})"
|
||||
|
||||
|
||||
class PastebinAPI:
    """
    Wrapper class for interacting with Pastebin.

    This class provides convenient methods to scrape various pages of the
    site using HTTP requests. It supports custom headers and proxies for
    additional flexibility.

    Args:
        headers (dict, optional): Dictionary containing custom headers to be sent with each request.
        proxies (dict, optional): Dictionary containing proxy settings to be used for requests.
        debug (bool, optional): Setting this to true will print `urllib3` debug information.
        timeout (float, optional): Per-request timeout in seconds (default 10).
    """
    def __init__(self, headers=None, proxies=None, debug=False, timeout=10):
        self.base_url = 'https://pastebin.com'
        self.timeout = timeout
        self.session = requests.Session()

        # Retry transient failures (rate limiting / server errors) with backoff.
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

        if headers:
            self.session.headers.update(headers)
        if proxies:
            self.session.proxies.update(proxies)

        if debug:
            # Only install a default handler if logging is not already configured.
            if not logger.handlers:
                logging.basicConfig()

            logging.getLogger("urllib3").setLevel(logging.DEBUG)

    def __get(self, endpoint, params=None):
        """
        Execute a GET request against the given endpoint.

        Args:
            endpoint (str): Path (with leading '/') appended to the base URL.
            params (dict, optional): Query string parameters.

        Returns:
            requests.Response: The successful response.

        Raises:
            ApiRequestException: If the request fails or returns an error status.
        """
        url = self.base_url + endpoint

        try:
            # A timeout prevents hanging indefinitely on an unresponsive server.
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            # Chain the original exception so the root cause is preserved.
            raise ApiRequestException(f'Failed to execute GET request: {str(e)}') from e

        return response

    def get_public_paste_list(self):
        """
        Scrape the public paste archive.

        Returns:
            list | None: Unfetched `Paste` objects, or None if the request failed.
        """
        endpoint = '/archive'

        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(e)
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        pastes_table = soup.select('div[data-key]')

        return [self._parse_paste(paste_html) for paste_html in pastes_table]

    def get_paste(self, paste):
        """
        Fetch full text and metadata for a paste.

        Args:
            paste (Paste): A paste as returned by `get_public_paste_list`.

        Returns:
            Paste | None: The paste with metadata and text populated, or None
            if the paste page request failed.
        """
        endpoint = f'/{paste.href}'

        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(e)
            return None

        # Get paste metadata
        soup = BeautifulSoup(response.text, 'html.parser')
        paste = self._parse_paste_metadata(soup, paste)

        # Get paste text
        paste = self._fetch_paste_text(paste)

        return paste

    def _parse_paste(self, paste_html):
        """Build an unfetched `Paste` from one archive-table row (`div[data-key]`)."""
        # One find_all instead of repeated find('a') calls: first anchor is the
        # title link, last anchor is the language.
        anchors = paste_html.find_all('a')
        paste_title = anchors[0].text
        paste_link = anchors[0]['href'].lstrip('/')
        paste_lang = anchors[-1].text

        return Paste(href=paste_link, lang=paste_lang, title=paste_title)

    def _parse_paste_metadata(self, soup, paste):
        """
        Populate author, pub_date and category from a paste page.

        Missing elements are tolerated: the corresponding field is simply left
        as None instead of raising AttributeError/IndexError.
        """
        username = soup.find('div', class_='username')
        if username and username.a:
            paste.author = username.a.text

        date = soup.find('div', class_='date')
        if date and date.span:
            paste.pub_date = date.span.text

        category = soup.find('span', title='Category')
        if category:
            # The span text has the form "Category | <name>".
            parts = category.text.split('|')
            if len(parts) > 1:
                paste.category = parts[1].strip()

        return paste

    def _fetch_paste_text(self, paste):
        """Download the raw paste text; sets `fetched` to True only on success."""
        endpoint = f'/raw/{paste.href}'
        try:
            response = self.__get(endpoint)
        except ApiRequestException as e:
            logger.warning(f'Could not fetch paste text: {str(e)}')
        else:
            paste.text = response.text
            paste.fetched = True

        return paste
|
2
src/requirements.txt
Normal file
2
src/requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
requests
|
||||
beautifulsoup4
|
Loading…
Reference in New Issue
Block a user