author     Your Name  2024-04-18 19:27:34 +0330
committer  Your Name  2024-04-18 19:27:34 +0330
commit     ac4a91fb9467ae1fca20253c390da6ca65a59a6a (patch)
tree       bb3e45140b91f25ea30a3691c733d20f0987a8dc /web2pdf.py
parent     93193a3f808a7e170a0e1ef6b999bddf7c8a0371 (diff)
download   aur-ac4a91fb9467ae1fca20253c390da6ca65a59a6a.tar.gz
web2pdf v3
Diffstat (limited to 'web2pdf.py')
-rw-r--r--  web2pdf.py  77
1 file changed, 74 insertions(+), 3 deletions(-)
diff --git a/web2pdf.py b/web2pdf.py
index a22c70fb7e56..b140bb57dd7a 100644
--- a/web2pdf.py
+++ b/web2pdf.py
@@ -2,10 +2,11 @@ import argparse
 import requests
 from bs4 import BeautifulSoup
 import weasyprint
+import os
 
 def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes=None):
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
 
     response = requests.get(url, headers=headers)
@@ -34,14 +35,84 @@ def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes
             print(f"No content found based on the provided ID or class on {url}")
     else:
         print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
+
+
+def save_and_delete_images(url, pdf_name, id=None, class_name=None, exclude_classes=None):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        if id:
+            content = soup.find(id=id)
+        elif class_name:
+            content = soup.find(class_=class_name)
+        else:
+            content = soup.body
+
+        if content:
+            if exclude_classes:
+                for exclude_class in exclude_classes:
+                    elements = content.find_all(class_=exclude_class)
+                    for element in elements:
+                        element.decompose()
+
+            image_tags = content.find_all('img')
+
+            save_folder = pdf_name
+            image_filenames = []
+
+            if not os.path.exists(save_folder):
+                os.makedirs(save_folder)
+
+            for idx, img_tag in enumerate(image_tags):
+                img_url = img_tag.get('src')
+
+                img_response = requests.get(img_url)
+                if img_response.status_code == 200:
+                    # save the image bytes to disk
+                    img_filename = os.path.join(save_folder, f"image_{idx}.jpg")
+                    # record the filename for the PDF conversion step
+                    image_filenames.append(img_filename)
+
+                    with open(img_filename, "wb") as file:
+                        file.write(img_response.content)
+                    print(f"Image downloaded and saved successfully: {img_filename}")
+                else:
+                    print("Failed to download the image. Status code:", img_response.status_code)
+
+            # After saving all images, stitch them into a single PDF using
+            # ImageMagick's 'convert' command (assumed to be available on Linux)
+
+            image_files_str = " ".join(image_filenames)
+            convert_command = f"convert {image_files_str} {pdf_name}.pdf"
+
+            # Run the convert command using os.system
+            os.system(convert_command)
+
+            print(f"PDF file created from images: {pdf_name}.pdf")
+        else:
+            print(f"No content found based on the provided ID or class on {url}")
+    else:
+        print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Save webpage content as PDF')
+    parser = argparse.ArgumentParser(description='Save webpage content as PDF or images')
+
     parser.add_argument('url', type=str, help='URL of the webpage to scrape')
     parser.add_argument('--id', type=str, help='ID of the content to extract')
     parser.add_argument('--class', dest='class_name', type=str, help='Class name of the content to extract')
     parser.add_argument('--exclude', nargs='+', help='Class names of elements to exclude')
+    parser.add_argument('--comic-mode', action='store_true', help='Download the page images and bundle them into a PDF (like a comic or manga chapter)')
     parser.add_argument('pdf_name', type=str, help='Name of the PDF file to save')
+
     args = parser.parse_args()
 
-    save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude)
+    if args.comic_mode:
+        save_and_delete_images(args.url, args.pdf_name, args.id, args.class_name, args.exclude)
+    else:
+        save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude)
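
With this commit the script has two paths: the default WeasyPrint rendering of the selected region, and the new --comic-mode path that downloads each image in the region and binds them into a PDF. A hypothetical invocation sketch (the URL, ID, and class name below are placeholders, not taken from the repository):

    # Default mode: render the matched region to a PDF with WeasyPrint
    python web2pdf.py https://example.com/article article --id main-content

    # Comic mode: download every <img> in the region and convert them to chapter1.pdf
    python web2pdf.py https://example.com/chapter-1 chapter1 --comic-mode --class reader-area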
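
One caveat in the new download loop: img_tag.get('src') is fetched as-is, but many pages use relative src values such as /images/01.jpg, which requests rejects with a MissingSchema error. A possible hardening, not part of this commit, is to resolve each src against the page URL with the standard library's urljoin:

    from urllib.parse import urljoin

    # Resolve a possibly-relative src against the page URL before fetching
    img_url = urljoin(url, img_tag.get('src'))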
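
Also worth noting: convert_command is assembled by string interpolation and executed with os.system, so a pdf_name containing spaces or shell metacharacters will split or break the command (and since save_folder is derived from pdf_name, every image path inherits the problem). A sketch of the same step without a shell, assuming ImageMagick's convert is on PATH:

    import subprocess

    # Pass the arguments as a list: no shell parsing happens, so spaces in
    # pdf_name or the image paths are handled correctly
    subprocess.run(["convert", *image_filenames, f"{pdf_name}.pdf"], check=True)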