diff options
Diffstat (limited to 'web2pdf.py')
-rw-r--r-- | web2pdf.py | 65 |
1 files changed, 24 insertions, 41 deletions
diff --git a/web2pdf.py b/web2pdf.py index 9b3759e4383a..a22c70fb7e56 100644 --- a/web2pdf.py +++ b/web2pdf.py @@ -1,10 +1,9 @@ import argparse import requests from bs4 import BeautifulSoup -from reportlab.pdfgen import canvas -from reportlab.lib.pagesizes import letter +import weasyprint -def save_content_as_pdf(url, id, pdf_name): +def save_content_as_pdf(url, pdf_name, id=None, class_name=None, exclude_classes=None): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' } @@ -12,53 +11,37 @@ def save_content_as_pdf(url, id, pdf_name): response = requests.get(url, headers) if response.status_code == 200: soup = BeautifulSoup(response.content, 'html.parser') - novel_content = soup.find(id=id) - if novel_content: + + if id: + content = soup.find(id=id) + elif class_name: + content = soup.find(class_=class_name) + else: + content = soup.body + + if content: + if exclude_classes: + for exclude_class in exclude_classes: + elements = content.find_all(class_=exclude_class) + for element in elements: + element.decompose() + pdf_filename = f"{pdf_name}.pdf" - c = canvas.Canvas(pdf_filename, pagesize=letter) - c.setFont("Helvetica", 12) - - # Define padding values - top_padding = 30 - left_padding = 30 - right_padding = 30 - bottom_padding = 30 - - content_text = novel_content.get_text(separator='\n') - - lines = content_text.split('\n') - - y_position = 800 - top_padding - for line in lines: - if y_position < bottom_padding: - c.showPage() - y_position = 800 - top_padding - - words = line.split() - wrapped_line = '' - for word in words: - if c.stringWidth(wrapped_line + ' ' + word) < 500 - left_padding - right_padding: - wrapped_line += ' ' + word - else: - c.drawString(left_padding, y_position, wrapped_line.strip()) - y_position -= 20 - wrapped_line = word - if wrapped_line: - c.drawString(left_padding, y_position, wrapped_line.strip()) - y_position -= 20 - - c.save() + html_content = str(content) + pdf = weasyprint.HTML(string=html_content, base_url=url).write_pdf(pdf_filename) print(f"PDF file saved: {pdf_filename}") else: - print(f"No content with id '{id}' found on {url}") + print(f"No content found based on the provided ID or class on {url}") else: print(f"Failed to retrieve content from {url}. Status code: {response.status_code}") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Save webpage content as PDF') parser.add_argument('url', type=str, help='URL of the webpage to scrape') - parser.add_argument('id', type=str, help='ID of the content to extract') + parser.add_argument('--id', type=str, help='ID of the content to extract') + parser.add_argument('--class', dest='class_name', type=str, help='Class name of the content to extract') + parser.add_argument('--exclude', nargs='+', help='Class names of elements to exclude') parser.add_argument('pdf_name', type=str, help='Name of the PDF file to save') args = parser.parse_args() - save_content_as_pdf(args.url, args.id, args.pdf_name) + save_content_as_pdf(args.url, args.pdf_name, args.id, args.class_name, args.exclude) |