diff options
author | Your Name | 2024-04-18 19:27:34 +0330 |
---|---|---|
committer | Your Name | 2024-04-18 19:27:34 +0330 |
commit | ac4a91fb9467ae1fca20253c390da6ca65a59a6a (patch) | |
tree | bb3e45140b91f25ea30a3691c733d20f0987a8dc | |
parent | 93193a3f808a7e170a0e1ef6b999bddf7c8a0371 (diff) | |
download | aur-ac4a91fb9467ae1fca20253c390da6ca65a59a6a.tar.gz |
web2pdf v3
-rw-r--r-- | .SRCINFO | 1 | ||||
-rw-r--r-- | PKGBUILD | 2 | ||||
-rw-r--r-- | README.md | 48 | ||||
-rw-r--r-- | web2pdf.py | 77 |
4 files changed, 122 insertions, 6 deletions
@@ -9,6 +9,7 @@ pkgbase = web2pdf depends = python-requests depends = python-beautifulsoup4 depends = python-weasyprint + depends = imagemagick source = web2pdf.py md5sums = SKIP @@ -6,7 +6,7 @@ pkgdesc="A CLI tool to extract a part of a website, create a PDF" arch=('any') url="https://github.com/simit22/web2pdf" license=('GPL') -depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-weasyprint') +depends=('python' 'python-requests' 'python-beautifulsoup4' 'python-weasyprint' 'imagemagick') source=("web2pdf.py") md5sums=('SKIP') diff --git a/README.md b/README.md index 4851ac7c2919..898937fbc856 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ A CLI tool to extract a part of a website, create a PDF (new version now support both class and id , even some styling ) +(new version now support --comic-mode read document for more info ) + ------ > let me tell u a really fun story @@ -43,7 +45,7 @@ available in aur if u wanna compile it yourself -1. clone repository +1. clone repository 2. go to cloned file @@ -60,6 +62,49 @@ if u wanna compile it yourself 6. u can even make build its binary yourself it is easy + + +# how to use ? + +``` usage: web2pdf.py [-h] [--id ID] [--class CLASS_NAME] + +usage: web2pdf.py [-h] [--id ID] [--class CLASS_NAME] + [--exclude EXCLUDE [EXCLUDE ...]] [--comic-mode] + url pdf_name + +Save webpage content as PDF or images + +positional arguments: + url URL of the webpage to scrape + pdf_name Name of the PDF file to save + +options: + -h, --help show this help message and exit + --id ID ID of the content to extract + --class CLASS_NAME Class name of the content to extract + --exclude EXCLUDE [EXCLUDE ...] + Class names of elements to exclude + --comic-mode Save images and pdf them (like a real comic or manga)``` + +``` + +- `--comic-mode` : sometimes u wanna download a manga or comic from INTERNET + + they have a part that comic is saved using very long images that r put tougher + + downloading them one by one and make a pdf out of it is hard and somehow impossible + + you can use web2pdf using `--comic-mode` these times + + 1. it will make a dir with the same name of pdf and save all of page images + 2. and than make a pdf out of it + +- ` --id ID ID of the content to extract + --class CLASS_NAME Class name of the content to extract + --exclude EXCLUDE [EXCLUDE ...]` + + these args r optional by default it will make a pdf out of all website + # what to do next @@ -73,4 +118,3 @@ if u wanna compile it yourself in the end i will be happy if u share your ideas about this script with me TY so much ❤️ - diff --git a/web2pdf.py b/web2pdf.py index a22c70fb7e56..b140bb57dd7a 100644 --- a/web2pdf.py +++ b/web2pdf.py @@ -2,10 +2,11 @@ import argparse import requests from bs4 import BeautifulSoup import weasyprint +import os def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes=None): headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' } response = requests.get(url, headers) @@ -34,14 +35,84 @@ def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes print(f"No content found based on the provided ID or class on {url}") else: print(f"Failed to retrieve content from {url}. Status code: {response.status_code}") + + +def save_and_delete_images(url, pdf_name, id=None, class_name=None, exclude_classes=None): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' + } + + response = requests.get(url, headers) + if response.status_code == 200: + soup = BeautifulSoup(response.content, 'html.parser') + + if id: + content = soup.find(id=id) + elif class_name: + content = soup.find(class_=class_name) + else: + content = soup.body + + if content: + if exclude_classes: + for exclude_class in exclude_classes: + elements = content.find_all(class_=exclude_class) + for element in elements: + element.decompose() + + image_tags = content.find_all('img') + + save_folder = pdf_name + image_filenames = [] + + if not os.path.exists(save_folder): + os.makedirs(save_folder) + + for idx, img_tag in enumerate(image_tags): + img_url = img_tag.get('src') + + img_response = requests.get(img_url) + if img_response.status_code == 200: + #save images + img_filename = os.path.join(save_folder, f"image_{idx}.jpg") + #save name to array + image_filenames.append(img_filename) + + with open(img_filename, "wb") as file: + file.write(img_response.content) + print(f"Image downloaded and saved successfully: {img_filename}") + else: + print("Failed to download the image. Status code:", img_response.status_code) + + # After saving all images, create a PDF using the 'convert' command in Linux + + # Use the 'convert' command to create a PDF from images in the folder + image_files_str = " ".join(image_filenames) + convert_command = f"convert {image_files_str} {pdf_name}.pdf" + + # Run the convert command using os.system + os.system(convert_command) + + print(f"PDF file created from images: {pdf_name}") + else: + print(f"No content found based on the provided ID or class on {url}") + else: + print(f"Failed to retrieve content from {url}. Status code: {response.status_code}") + if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Save webpage content as PDF') + parser = argparse.ArgumentParser(description='Save webpage content as PDF or images') + parser.add_argument('url', type=str, help='URL of the webpage to scrape') parser.add_argument('--id', type=str, help='ID of the content to extract') parser.add_argument('--class', dest='class_name', type=str, help='Class name of the content to extract') parser.add_argument('--exclude', nargs='+', help='Class names of elements to exclude') + parser.add_argument('--comic-mode', action='store_true', help='Save images and pdf them (like a real comic or manga)') parser.add_argument('pdf_name', type=str, help='Name of the PDF file to save') + args = parser.parse_args() - save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude) + if args.comic_mode: + save_and_delete_images(args.url, args.pdf_name, args.id, args.class_name, args.exclude) + else: + save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude) |