web2pdf v2

author: Your Name 2024-04-18 11:21:19 +0330
committer: Your Name 2024-04-18 11:21:19 +0330
commit: 93193a3f808a7e170a0e1ef6b999bddf7c8a0371 (patch)
tree: e642c0c94199851d96c0de7066f1a023f46a2fab
parent: ffab0040d89537e1db5b9c975afb8e14b9514920 (diff)
download: aur-93193a3f808a7e170a0e1ef6b999bddf7c8a0371.tar.gz
5 files changed, 103 insertions, 45 deletions
diff --git a/.SRCINFO b/.SRCINFO
index bf22b52727a7..60dde9b3c9f0 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,6 +1,6 @@
 pkgbase = web2pdf
 	pkgdesc = A CLI tool to extract a part of a website, create a PDF
-	pkgver = 1.0
+	pkgver = 2.0
 	pkgrel = 1
 	url = https://github.com/simit22/web2pdf
 	arch = any
@@ -8,7 +8,7 @@ pkgbase = web2pdf
 	depends = python
 	depends = python-requests
 	depends = python-beautifulsoup4
-	depends = python-reportlab
+	depends = python-weasyprint
 	source = web2pdf.py
 	md5sums = SKIP
 
diff --git a/PKGBUILD b/PKGBUILD
index 19692563d4b3..7fbd8e57b775 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -1,12 +1,12 @@
 # Maintainer: Your Name <your_email@example.com>
 pkgname=web2pdf
-pkgver=1.0
+pkgver=2.0
 pkgrel=1
 pkgdesc="A CLI tool to extract a part of a website, create a PDF"
 arch=('any')
 url="https://github.com/simit22/web2pdf"
 license=('GPL')
-depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-reportlab')
+depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-weasyprint')
 source=("web2pdf.py")
 md5sums=('SKIP')
 
diff --git a/README.md b/README.md
index c0dc6aace1b2..4851ac7c2919 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,76 @@
 # web2pdf
+
+A CLI tool to extract a part of a website, create a PDF
+
+(the new version now supports both class and id selectors, and even some styling)
+
+------
+
+> let me tell u a really fun story
+> i am a manga and lightnovel fan 
+>
+> lightnovels r novels that never become a book 
+> r really better than many books 
+> they rarely become even pdf 
+> some of them even dont get translated to english and r still korean
+>
+> so i wanted to read them as pdf and doing it manually is really hard and boring
+>
+> so lets go to point 
+> i wrote my own tools to do so -------- web2pdf
+
+it is totally cool u just give it
+
+1. **web page** 
+2. **the part that contains the novel or anything else (id or class)**
+
+and it does the job
+it turns all of it into a ***perfect pdf*** 
+
+it is called **web2pdf**
+
+
+
+# how to install
+
+if u use arch linux btw
+
+`yay -S web2pdf`
+
+available in aur 
+
+ https://aur.archlinux.org/packages/web2pdf
+
+if u wanna compile it yourself 
+
+1.  clone repository
+
+2. go to the cloned directory
+
+3. go to venv using 
+    `cd ./bin/ && source activate` 
+
+4. install dependencies 
+
+   `pip install requests beautifulsoup4 weasyprint`  
+
+5. run and enjoy using python 
+
+   `python web2pdf.py`
+
+6. u can even build its binary yourself, it is easy
+
+
+
+# what to do next
+
+- [ ] maybe adding translation ability to cli tools
+
+  
+
+# end ?!
+
+in the end i will be happy if u share your ideas about this script with me 
+
+TY so much ❤️
+
diff --git a/web2pdf-1.0-1-any.pkg.tar.zst b/web2pdf-1.0-1-any.pkg.tar.zst
deleted file mode 100644
index 10f83fb21078..000000000000
--- a/web2pdf-1.0-1-any.pkg.tar.zst
+++ /dev/null
diff --git a/web2pdf.py b/web2pdf.py
index 9b3759e4383a..a22c70fb7e56 100644
--- a/web2pdf.py
+++ b/web2pdf.py
@@ -1,10 +1,9 @@
 import argparse
 import requests
 from bs4 import BeautifulSoup
-from reportlab.pdfgen import canvas
-from reportlab.lib.pagesizes import letter
+import weasyprint
 
-def save_content_as_pdf(url, id, pdf_name):
+def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes=None):
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
@@ -12,53 +11,37 @@ def save_content_as_pdf(url, id, pdf_name):
     response = requests.get(url, headers)
     if response.status_code == 200:
         soup = BeautifulSoup(response.content, 'html.parser')
-        novel_content = soup.find(id=id)
-        if novel_content:
+        
+        if id:
+            content = soup.find(id=id)
+        elif class_name:
+            content = soup.find(class_=class_name)
+        else:
+            content = soup.body
+
+        if content:
+            if exclude_classes:
+                for exclude_class in exclude_classes:
+                    elements = content.find_all(class_=exclude_class)
+                    for element in elements:
+                        element.decompose()
+
             pdf_filename = f"{pdf_name}.pdf"
-            c = canvas.Canvas(pdf_filename, pagesize=letter)
-            c.setFont("Helvetica", 12)
-            
-            # Define padding values
-            top_padding = 30
-            left_padding = 30
-            right_padding = 30
-            bottom_padding = 30
-            
-            content_text = novel_content.get_text(separator='\n')
-            
-            lines = content_text.split('\n')
-            
-            y_position = 800 - top_padding
-            for line in lines:
-                if y_position < bottom_padding:
-                    c.showPage()
-                    y_position = 800 - top_padding
-                
-                words = line.split()
-                wrapped_line = ''
-                for word in words:
-                    if c.stringWidth(wrapped_line + ' ' + word) < 500 - left_padding - right_padding:
-                        wrapped_line += ' ' + word
-                    else:
-                        c.drawString(left_padding, y_position, wrapped_line.strip())
-                        y_position -= 20
-                        wrapped_line = word
-                if wrapped_line:
-                    c.drawString(left_padding, y_position, wrapped_line.strip())
-                    y_position -= 20
-                
-            c.save()
+            html_content = str(content)
+            pdf = weasyprint.HTML(string=html_content, base_url=url).write_pdf(pdf_filename)
             print(f"PDF file saved: {pdf_filename}")
         else:
-            print(f"No content with id '{id}' found on {url}")
+            print(f"No content found based on the provided ID or class on {url}")
     else:
         print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Save webpage content as PDF')
     parser.add_argument('url', type=str, help='URL of the webpage to scrape')
-    parser.add_argument('id', type=str, help='ID of the content to extract')
+    parser.add_argument('--id', type=str, help='ID of the content to extract')
+    parser.add_argument('--class', dest='class_name', type=str, help='Class name of the content to extract')
+    parser.add_argument('--exclude', nargs='+', help='Class names of elements to exclude')
     parser.add_argument('pdf_name', type=str, help='Name of the PDF file to save')
     args = parser.parse_args()
 
-    save_content_as_pdf(args.url, args.id, args.pdf_name)
+    save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude)
author	Your Name	2024-04-18 11:21:19 +0330
committer	Your Name	2024-04-18 11:21:19 +0330
commit	93193a3f808a7e170a0e1ef6b999bddf7c8a0371 (patch)
tree	e642c0c94199851d96c0de7066f1a023f46a2fab
parent	ffab0040d89537e1db5b9c975afb8e14b9514920 (diff)
download	aur-93193a3f808a7e170a0e1ef6b999bddf7c8a0371.tar.gz