In [ ]:
!pip install pandas pymupdf
!pip install reportlab
In [ ]:
# Importing multiple PDF documents into a DataFrame
import os
import pandas as pd
import fitz # PyMuPDF
def read_pdf(file_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string holding the text of all pages, in document order.
    """
    # Opening the document as a context manager closes it when the block
    # exits, even if text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
# Folder that is scanned for PDF files; replace with a real directory.
folder_path = 'path/to/your/folder'

# Build one {'filename', 'text'} record per PDF found directly in the folder.
documents = []
# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(folder_path)):
    # Compare case-insensitively so files named e.g. 'REPORT.PDF' are included.
    if filename.lower().endswith('.pdf'):
        file_path = os.path.join(folder_path, filename)
        doc_text = read_pdf(file_path)
        documents.append({'filename': filename, 'text': doc_text})

# Naming the columns keeps them present even when no PDF was found.
df = pd.DataFrame(documents, columns=['filename', 'text'])
print(df)
In [ ]:
# Writing DataFrame to PDF Files
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
def write_text_to_pdf(text, file_path):
    """Write text to a new letter-size PDF, one drawn line per text line.

    The text is split on newline characters and each piece is drawn on its
    own line, starting a new page whenever the bottom margin is reached.
    Lines wider than the page are not wrapped.

    Args:
        text: The string to write; may contain newline characters.
        file_path: Destination path of the PDF file.
    """
    margin = 72       # distance from the page edges, in points
    line_height = 15  # vertical step between consecutive lines, in points

    c = canvas.Canvas(file_path, pagesize=letter)
    _, height = letter
    y = height - margin
    # drawString lays out a single line only, so multi-line text has to be
    # split and positioned line by line.
    for line in text.split('\n'):
        if y < margin:
            # Out of vertical space: continue at the top of a fresh page.
            c.showPage()
            y = height - margin
        c.drawString(margin, y, line)
        y -= line_height
    c.save()
# Write the text of every row to its own PDF file.
for index, row in df.iterrows():
    # A frame with only 'filename' and 'text' columns has no 'page_number';
    # add the page suffix only when the column exists so the lookup cannot
    # raise KeyError.
    if 'page_number' in row:
        page_suffix = f"_page_{row['page_number']}"
    else:
        page_suffix = ""
    output_file = f"output_{row['filename']}{page_suffix}.pdf"
    write_text_to_pdf(row['text'], output_file)
In [ ]:
# Writing multiple pages to pdf document
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# Sample DataFrame: one row per page, keyed by document name and page number.
data = {
    'filename': ['doc1', 'doc1', 'doc2', 'doc2'],
    'page_number': [1, 2, 1, 2],
    'text': [
        'This is the text of page 1 of doc1.',
        'This is the text of page 2 of doc1.',
        'This is the text of page 1 of doc2.',
        'This is the text of page 2 of doc2.',
    ],
}
df = pd.DataFrame(data)
def write_dataframe_to_pdf(df, output_file):
    """Write every row of a DataFrame into one multi-page PDF.

    Each row starts on a new page with a "Filename" and "Page Number"
    header, followed by the row's text drawn one line per newline-separated
    piece. Text that does not fit continues on additional pages.

    Args:
        df: DataFrame with 'filename', 'page_number' and 'text' columns.
        output_file: Destination path of the PDF file.
    """
    margin = 72       # distance from the page edges, in points
    line_height = 15  # vertical step between consecutive body lines

    c = canvas.Canvas(output_file, pagesize=letter)
    _, height = letter
    for _, row in df.iterrows():
        c.drawString(margin, height - 72, f"Filename: {row['filename']}")
        c.drawString(margin, height - 90, f"Page Number: {row['page_number']}")
        y = height - 120
        for line in row['text'].split('\n'):
            if y < margin:
                # Body text reached the bottom margin: continue on a new page
                # instead of drawing below the visible area.
                c.showPage()
                y = height - margin
            c.drawString(margin, y, line)
            y -= line_height
        # Finish this row's page so the next row starts on a fresh one.
        c.showPage()
    c.save()
# Render the sample DataFrame into a single multi-page PDF.
write_dataframe_to_pdf(df, 'output.pdf')
In [ ]:
!pip install pdfkit
!pip install reportlab
In [ ]:
# Converting HTML to PDF
import pdfkit
# HTML content
# NOTE(review): the markup of this sample had been stripped, leaving only its
# three text fragments; the tags below are a reconstruction - confirm against
# the original notebook.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Sample HTML</title>
</head>
<body>
    <h1>Hello, World!</h1>
    <p>This is a sample HTML to PDF conversion.</p>
</body>
</html>
"""
# Convert HTML string to PDF
pdfkit.from_string(html_content, 'output.pdf')
In [ ]:
# Converting an HTML File to PDF
import pdfkit
# Path to the HTML file (it must already exist on disk when this cell runs)
html_file = 'sample.html'
# Convert HTML file to PDF
pdfkit.from_file(html_file, 'output.pdf')
In [ ]:
# Converting a URL to PDF
import pdfkit
# URL of the web page
url = 'https://www.example.com'
# Convert URL to PDF
pdfkit.from_url(url, 'output.pdf')
In [ ]:
# more options
import pdfkit
# NOTE(review): the markup of this sample had been stripped, leaving only its
# three text fragments; the tags below are a reconstruction - confirm against
# the original notebook.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Sample HTML</title>
</head>
<body>
    <h1>Hello, World!</h1>
    <p>This is a sample HTML to PDF conversion.</p>
</body>
</html>
"""
# Page layout settings handed to pdfkit through its `options` argument.
options = {
    'page-size': 'A4',
    'orientation': 'Portrait',
    'margin-top': '10mm',
    'margin-right': '10mm',
    'margin-bottom': '10mm',
    'margin-left': '10mm',
}
pdfkit.from_string(html_content, 'output.pdf', options=options)
In [ ]:
# Adding Images to a PDF
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
def create_pdf_with_images(
    output_file,
    image_path='path/to/your/image.jpg',
    another_image_path='path/to/another/image.png',
):
    """Create a one-page letter-size PDF with two text lines and two images.

    Args:
        output_file: Destination path of the PDF file.
        image_path: Path of the image drawn on the left. Defaults to the
            placeholder path this example was written with.
        another_image_path: Path of the image drawn on the right. Defaults
            to the placeholder path this example was written with.
    """
    c = canvas.Canvas(output_file, pagesize=letter)
    _, height = letter

    # Add some text
    c.drawString(72, height - 72, "Hello, World!")
    c.drawString(72, height - 90, "This is a sample PDF with images.")

    # Add an image. mask='auto' makes transparent pixels of images that
    # carry an alpha channel stay transparent.
    image = ImageReader(image_path)
    c.drawImage(image, 72, height - 300, width=200, height=200, mask='auto')

    # Add another image to the right of the first one.
    another_image = ImageReader(another_image_path)
    c.drawImage(another_image, 300, height - 300, width=200, height=200,
                mask='auto')

    c.save()
# Build the sample PDF containing text and images.
create_pdf_with_images('output_with_images.pdf')
In [ ]:
# Adding Images to Each Page from a DataFrame
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
# Sample DataFrame: each row supplies the text and the image for one PDF page.
data = {
    'text': ['Page 1 text', 'Page 2 text', 'Page 3 text'],
    'image_path': [
        'path/to/image1.jpg',
        'path/to/image2.jpg',
        'path/to/image3.jpg',
    ],
}
df = pd.DataFrame(data)
def create_pdf_from_dataframe(df, output_file):
    """Create a PDF with one page per DataFrame row, each with text and an image.

    Args:
        df: DataFrame with a 'text' column (the line drawn at the top of the
            page) and an 'image_path' column (the image drawn below it).
        output_file: Destination path of the PDF file.
    """
    c = canvas.Canvas(output_file, pagesize=letter)
    _, height = letter
    for _, row in df.iterrows():
        # Add text
        c.drawString(72, height - 72, row['text'])
        # Add image. mask='auto' makes transparent pixels of images that
        # carry an alpha channel stay transparent.
        image = ImageReader(row['image_path'])
        c.drawImage(image, 72, height - 300, width=200, height=200,
                    mask='auto')
        # Create a new page
        c.showPage()
    c.save()
# Render the sample DataFrame: one page per row.
create_pdf_from_dataframe(df, 'output_from_dataframe.pdf')
In [ ]:
# Converting HTML content with image links to PDF
import pdfkit
# NOTE(review): the markup of this sample had been stripped, leaving only its
# three text fragments; the tags below are a reconstruction. The original
# <img> elements could not be recovered - re-add them inside <body> with the
# real image URLs or paths.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Sample HTML with Images</title>
</head>
<body>
    <h1>Welcome to My PDF</h1>
    <p>This is a sample HTML file with images.</p>
</body>
</html>
"""
# Save the markup to disk first, because from_file converts a file, not a string.
# An explicit encoding keeps the written bytes independent of the platform default.
with open('sample.html', 'w', encoding='utf-8') as file:
    file.write(html_content)
pdfkit.from_file('sample.html', 'output.pdf')