diff options
author | Your Name | 2024-04-18 11:21:19 +0330 |
---|---|---|
committer | Your Name | 2024-04-18 11:21:19 +0330 |
commit | 93193a3f808a7e170a0e1ef6b999bddf7c8a0371 (patch) | |
tree | e642c0c94199851d96c0de7066f1a023f46a2fab | |
parent | ffab0040d89537e1db5b9c975afb8e14b9514920 (diff) | |
download | aur-93193a3f808a7e170a0e1ef6b999bddf7c8a0371.tar.gz |
web2pdf v2
-rw-r--r-- | .SRCINFO | 4 | ||||
-rw-r--r-- | PKGBUILD | 4 | ||||
-rw-r--r-- | README.md | 75 | ||||
-rw-r--r-- | web2pdf-1.0-1-any.pkg.tar.zst | bin | 17891 -> 0 bytes | |||
-rw-r--r-- | web2pdf.py | 65 |
5 files changed, 103 insertions, 45 deletions
@@ -1,6 +1,6 @@ pkgbase = web2pdf pkgdesc = A CLI tool to extract a part of a website, create a PDF - pkgver = 1.0 + pkgver = 2.0 pkgrel = 1 url = https://github.com/simit22/web2pdf arch = any @@ -8,7 +8,7 @@ pkgbase = web2pdf depends = python depends = python-requests depends = python-beautifulsoup4 - depends = python-reportlab + depends = python-weasyprint source = web2pdf.py md5sums = SKIP @@ -1,12 +1,12 @@ # Maintainer: Your Name <your_email@example.com> pkgname=web2pdf -pkgver=1.0 +pkgver=2.0 pkgrel=1 pkgdesc="A CLI tool to extract a part of a website, create a PDF" arch=('any') url="https://github.com/simit22/web2pdf" license=('GPL') -depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-reportlab') +depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-weasyprint') source=("web2pdf.py") md5sums=('SKIP') diff --git a/README.md b/README.md index c0dc6aace1b2..4851ac7c2919 100644 --- a/README.md +++ b/README.md @@ -1 +1,76 @@ # web2pdf + +A CLI tool to extract a part of a website, create a PDF + +(new version now support both class and id , even some styling ) + +------ + +> let me tell u a really fun story +> i am a manga and lightnovel fan +> +> lightnovels r novels that never become a book +> r really better than many books +> they rarely become even pdf +> some of them even dont get translated to english and r still korean +> +> so i wanted to read them as pdf and doing it manually is really hard and boring +> +> so lets go to point +> i wrote my own tools to do so -------- web2pdf + +it is totally cool u just give it + +1. **web page** +2. **the part that contain novel or anything ( id or class )** + +and it do the job +it make all of it to a ***perfect pdf*** + +it is called **web2pdf** + + + +# how to install + +if u use arch linux btw + +`yay -S web2pdf` + +available in aur + + https://aur.archlinux.org/packages/web2pdf + +if u wanna compile it yourself + +1. clone repository + +2. go to cloned file + +3. 
go to venv using + `cd ./bin/ && source activate` + +4. install dependencies + + `pip install requests beautifulsoup4 weasyprint` + +5. run and enjoy using python + + `python web2pdf.py` + +6. u can even build its binary yourself, it is easy + + + +# what to do next + +- [ ] maybe adding translation ability to cli tools + + + +# end ?! + +in the end i will be happy if u share your ideas about this script with me + +TY so much ❤️ + diff --git a/web2pdf-1.0-1-any.pkg.tar.zst b/web2pdf-1.0-1-any.pkg.tar.zst Binary files differ deleted file mode 100644 index 10f83fb21078..000000000000 --- a/web2pdf-1.0-1-any.pkg.tar.zst +++ /dev/null diff --git a/web2pdf.py b/web2pdf.py index 9b3759e4383a..a22c70fb7e56 100644 --- a/web2pdf.py +++ b/web2pdf.py @@ -1,10 +1,9 @@ import argparse import requests from bs4 import BeautifulSoup -from reportlab.pdfgen import canvas -from reportlab.lib.pagesizes import letter +import weasyprint -def save_content_as_pdf(url, id, pdf_name): +def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes=None): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' } @@ -12,53 +11,37 @@ def save_content_as_pdf(url, id, pdf_name): response = requests.get(url, headers) if response.status_code == 200: soup = BeautifulSoup(response.content, 'html.parser') - novel_content = soup.find(id=id) - if novel_content: + + if id: + content = soup.find(id=id) + elif class_name: + content = soup.find(class_=class_name) + else: + content = soup.body + + if content: + if exclude_classes: + for exclude_class in exclude_classes: + elements = content.find_all(class_=exclude_class) + for element in elements: + element.decompose() + pdf_filename = f"{pdf_name}.pdf" - c = canvas.Canvas(pdf_filename, pagesize=letter) - c.setFont("Helvetica", 12) - - # Define padding values - top_padding = 30 - left_padding = 30 - right_padding = 30 - bottom_padding = 30 - - 
content_text = novel_content.get_text(separator='\n') - - lines = content_text.split('\n') - - y_position = 800 - top_padding - for line in lines: - if y_position < bottom_padding: - c.showPage() - y_position = 800 - top_padding - - words = line.split() - wrapped_line = '' - for word in words: - if c.stringWidth(wrapped_line + ' ' + word) < 500 - left_padding - right_padding: - wrapped_line += ' ' + word - else: - c.drawString(left_padding, y_position, wrapped_line.strip()) - y_position -= 20 - wrapped_line = word - if wrapped_line: - c.drawString(left_padding, y_position, wrapped_line.strip()) - y_position -= 20 - - c.save() + html_content = str(content) + pdf = weasyprint.HTML(string=html_content, base_url=url).write_pdf(pdf_filename) print(f"PDF file saved: {pdf_filename}") else: - print(f"No content with id '{id}' found on {url}") + print(f"No content found based on the provided ID or class on {url}") else: print(f"Failed to retrieve content from {url}. Status code: {response.status_code}") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Save webpage content as PDF') parser.add_argument('url', type=str, help='URL of the webpage to scrape') - parser.add_argument('id', type=str, help='ID of the content to extract') + parser.add_argument('--id', type=str, help='ID of the content to extract') + parser.add_argument('--class', dest='class_name', type=str, help='Class name of the content to extract') + parser.add_argument('--exclude', nargs='+', help='Class names of elements to exclude') parser.add_argument('pdf_name', type=str, help='Name of the PDF file to save') args = parser.parse_args() - save_content_as_pdf(args.url, args.id, args.pdf_name) + save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude) |