about summary log tree commit diff stats
path: root/web2pdf.py
diff options
context:
space:
mode:
Diffstat (limited to 'web2pdf.py')
-rw-r--r--web2pdf.py65
1 file changed, 24 insertions, 41 deletions
diff --git a/web2pdf.py b/web2pdf.py
index 9b3759e4383a..a22c70fb7e56 100644
--- a/web2pdf.py
+++ b/web2pdf.py
@@ -1,10 +1,9 @@
import argparse
import requests
from bs4 import BeautifulSoup
-from reportlab.pdfgen import canvas
-from reportlab.lib.pagesizes import letter
+import weasyprint
-def save_content_as_pdf(url, id, pdf_name):
+def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes=None):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
@@ -12,53 +11,37 @@ def save_content_as_pdf(url, id, pdf_name):
response = requests.get(url, headers)
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
- novel_content = soup.find(id=id)
- if novel_content:
+
+ if id:
+ content = soup.find(id=id)
+ elif class_name:
+ content = soup.find(class_=class_name)
+ else:
+ content = soup.body
+
+ if content:
+ if exclude_classes:
+ for exclude_class in exclude_classes:
+ elements = content.find_all(class_=exclude_class)
+ for element in elements:
+ element.decompose()
+
pdf_filename = f"{pdf_name}.pdf"
- c = canvas.Canvas(pdf_filename, pagesize=letter)
- c.setFont("Helvetica", 12)
-
- # Define padding values
- top_padding = 30
- left_padding = 30
- right_padding = 30
- bottom_padding = 30
-
- content_text = novel_content.get_text(separator='\n')
-
- lines = content_text.split('\n')
-
- y_position = 800 - top_padding
- for line in lines:
- if y_position < bottom_padding:
- c.showPage()
- y_position = 800 - top_padding
-
- words = line.split()
- wrapped_line = ''
- for word in words:
- if c.stringWidth(wrapped_line + ' ' + word) < 500 - left_padding - right_padding:
- wrapped_line += ' ' + word
- else:
- c.drawString(left_padding, y_position, wrapped_line.strip())
- y_position -= 20
- wrapped_line = word
- if wrapped_line:
- c.drawString(left_padding, y_position, wrapped_line.strip())
- y_position -= 20
-
- c.save()
+ html_content = str(content)
+ pdf = weasyprint.HTML(string=html_content, base_url=url).write_pdf(pdf_filename)
print(f"PDF file saved: {pdf_filename}")
else:
- print(f"No content with id '{id}' found on {url}")
+ print(f"No content found based on the provided ID or class on {url}")
else:
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Save webpage content as PDF')
parser.add_argument('url', type=str, help='URL of the webpage to scrape')
- parser.add_argument('id', type=str, help='ID of the content to extract')
+ parser.add_argument('--id', type=str, help='ID of the content to extract')
+ parser.add_argument('--class', dest='class_name', type=str, help='Class name of the content to extract')
+ parser.add_argument('--exclude', nargs='+', help='Class names of elements to exclude')
parser.add_argument('pdf_name', type=str, help='Name of the PDF file to save')
args = parser.parse_args()
- save_content_as_pdf(args.url, args.id, args.pdf_name)
+ save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude)