web2pdf v3

author: Your Name 2024-04-18 19:27:34 +0330
committer: Your Name 2024-04-18 19:27:34 +0330
commit: ac4a91fb9467ae1fca20253c390da6ca65a59a6a (patch)
tree: bb3e45140b91f25ea30a3691c733d20f0987a8dc
parent: 93193a3f808a7e170a0e1ef6b999bddf7c8a0371 (diff)
download: aur-ac4a91fb9467ae1fca20253c390da6ca65a59a6a.tar.gz
4 files changed, 122 insertions, 6 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 60dde9b3c9f0..1b0cd4fff901 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -9,6 +9,7 @@ pkgbase = web2pdf
 	depends = python-requests
 	depends = python-beautifulsoup4
 	depends = python-weasyprint
+	depends = imagemagick
 	source = web2pdf.py
 	md5sums = SKIP
 
diff --git a/PKGBUILD b/PKGBUILD
index 7fbd8e57b775..3909532f3dec 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -6,7 +6,7 @@ pkgdesc="A CLI tool to extract a part of a website, create a PDF"
 arch=('any')
 url="https://github.com/simit22/web2pdf"
 license=('GPL')
-depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-weasyprint')
+depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-weasyprint' 'imagemagick')
 source=("web2pdf.py")
 md5sums=('SKIP')
 
diff --git a/README.md b/README.md
index 4851ac7c2919..898937fbc856 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@ A CLI tool to extract a part of a website, create a PDF
 
 (new version now support  both class and id , even some styling )
 
+(the new version now supports `--comic-mode`; read the documentation below for more info)
+
 ------
 
 > let me tell u a really fun story
@@ -43,7 +45,7 @@ available in aur
 
 if u wanna compile it yourself 
 
-1.  clone repository
+1. clone repository
 
 2. go to cloned file
 
@@ -60,6 +62,49 @@ if u wanna compile it yourself
 
 6. u can even make build its binary  yourself it is easy
 
+    
+
+# how to use ?
+
+```
+     
+usage: web2pdf.py [-h] [--id ID] [--class CLASS_NAME]
+                  [--exclude EXCLUDE [EXCLUDE ...]] [--comic-mode]
+                  url pdf_name
+
+Save webpage content as PDF or images
+
+positional arguments:
+  url                   URL of the webpage to scrape
+  pdf_name              Name of the PDF file to save
+
+options:
+  -h, --help            show this help message and exit
+  --id ID               ID of the content to extract
+  --class CLASS_NAME    Class name of the content to extract
+  --exclude EXCLUDE [EXCLUDE ...]
+                        Class names of elements to exclude
+  --comic-mode          Save images and pdf them (like a real comic or manga)
+
+```
+
+- `--comic-mode`: sometimes you want to download a manga or comic from the internet.
+
+  These sites have a section where the comic is stored as very long images that are put together.
+
+  Downloading them one by one and making a PDF out of them is hard, sometimes nearly impossible.
+
+  In these cases you can run web2pdf with `--comic-mode`.
+
+  1. It creates a directory with the same name as the PDF and saves all of the page's images into it.
+  2. It then makes a PDF out of those images.
+
+- ` --id ID               ID of the content to extract
+    --class CLASS_NAME    Class name of the content to extract
+    --exclude EXCLUDE [EXCLUDE ...]`
+
+  These arguments are optional; by default it makes a PDF out of the whole web page.
+
 
 
 # what to do next
@@ -73,4 +118,3 @@ if u wanna compile it yourself
 in the end i will be happy if u share your ideas about this script with me 
 
 TY so much ❤️
-
diff --git a/web2pdf.py b/web2pdf.py
index a22c70fb7e56..b140bb57dd7a 100644
--- a/web2pdf.py
+++ b/web2pdf.py
@@ -2,10 +2,11 @@ import argparse
 import requests
 from bs4 import BeautifulSoup
 import weasyprint
+import os
 
 def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes=None):
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
 
     response = requests.get(url, headers)
@@ -34,14 +35,84 @@ def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes
             print(f"No content found based on the provided ID or class on {url}")
     else:
         print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
+        
+
+def save_and_delete_images(url, pdf_name, id=None, class_name=None, exclude_classes=None):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    response = requests.get(url, headers)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, 'html.parser')
+        
+        if id:
+            content = soup.find(id=id)
+        elif class_name:
+            content = soup.find(class_=class_name)
+        else:
+            content = soup.body
+
+        if content:
+            if exclude_classes:
+                for exclude_class in exclude_classes:
+                    elements = content.find_all(class_=exclude_class)
+                    for element in elements:
+                        element.decompose()
+
+            image_tags = content.find_all('img')
+            
+            save_folder = pdf_name
+            image_filenames = []
+            
+            if not os.path.exists(save_folder):
+                os.makedirs(save_folder)
+
+            for idx, img_tag in enumerate(image_tags):
+                img_url = img_tag.get('src')
+                
+                img_response = requests.get(img_url)
+                if img_response.status_code == 200:
+                    # Build a sequential file name inside the save folder (always given a .jpg extension)
+                    img_filename = os.path.join(save_folder, f"image_{idx}.jpg")
+                    # Remember the path so the image can be handed to 'convert' below
+                    image_filenames.append(img_filename)
+                    
+                    with open(img_filename, "wb") as file:
+                        file.write(img_response.content)
+                    print(f"Image downloaded and saved successfully: {img_filename}")
+                else:
+                    print("Failed to download the image. Status code:", img_response.status_code)
+
+            # After the download loop, combine the saved images into a single PDF (the images are kept on disk)
+
+            # Join the saved paths with spaces to form the argument list of 'convert'
+            image_files_str = " ".join(image_filenames)
+            convert_command = f"convert {image_files_str} {pdf_name}.pdf"
+            
+            # Run the command through the shell via os.system (NOTE(review): paths are not shell-quoted)
+            os.system(convert_command)
+
+            print(f"PDF file created from images: {pdf_name}")
+        else:
+            print(f"No content found based on the provided ID or class on {url}")
+    else:
+        print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Save webpage content as PDF')
+    parser = argparse.ArgumentParser(description='Save webpage content as PDF or images')
+
     parser.add_argument('url', type=str, help='URL of the webpage to scrape')
     parser.add_argument('--id', type=str, help='ID of the content to extract')
     parser.add_argument('--class', dest='class_name', type=str, help='Class name of the content to extract')
     parser.add_argument('--exclude', nargs='+', help='Class names of elements to exclude')
+    parser.add_argument('--comic-mode', action='store_true', help='Save images and pdf them (like a real comic or manga)')
     parser.add_argument('pdf_name', type=str, help='Name of the PDF file to save')
+
     args = parser.parse_args()
 
-    save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude)
+    if args.comic_mode:
+        save_and_delete_images(args.url, args.pdf_name, args.id, args.class_name, args.exclude)
+    else:
+        save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude)
author	Your Name	2024-04-18 19:27:34 +0330
committer	Your Name	2024-04-18 19:27:34 +0330
commit	ac4a91fb9467ae1fca20253c390da6ca65a59a6a (patch)
tree	bb3e45140b91f25ea30a3691c733d20f0987a8dc
parent	93193a3f808a7e170a0e1ef6b999bddf7c8a0371 (diff)
download	aur-ac4a91fb9467ae1fca20253c390da6ca65a59a6a.tar.gz