In [ ]:
!pip install python-docx
!pip install textract
In [ ]:
# Reading a .docx File into a DataFrame
import pandas as pd
from docx import Document
def read_docx_to_dataframe(file_path):
    """Load the paragraphs of a .docx file into a one-column DataFrame.

    Only ``doc.paragraphs`` is read, so content that is reachable solely
    through other collections of the document (for example ``doc.tables``)
    does not appear in the result.

    Args:
        file_path: Location of the .docx file; passed unchanged to
            ``Document``.

    Returns:
        A DataFrame with a single ``'Text'`` column containing one row per
        paragraph, in the order the paragraphs are yielded by the document.
    """
    doc = Document(file_path)
    paragraphs = [para.text for para in doc.paragraphs]
    return pd.DataFrame({'Text': paragraphs})
# Example usage: load sample.docx and display the resulting DataFrame.
df = read_docx_to_dataframe('sample.docx')
print(df)
In [ ]:
# Writing a DataFrame to a .docx File
import pandas as pd
from docx import Document
# Sample DataFrame: three people, one row each, with name, age and city.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}
df = pd.DataFrame(data)
def write_dataframe_to_docx(df, file_path):
    """Write a DataFrame to a .docx file as a single table.

    The table gets one header row holding the column labels followed by one
    row per DataFrame record. Every label and value is converted with
    ``str()`` before being stored in its cell. The DataFrame index is not
    written.

    Args:
        df: The DataFrame to export.
        file_path: Destination of the .docx file; passed unchanged to
            ``doc.save``.
    """
    doc = Document()
    table = doc.add_table(rows=1, cols=len(df.columns))

    # Header row. str() lets non-string column labels (ints, tuples, ...)
    # be written as cell text as well.
    for cell, column in zip(table.rows[0].cells, df.columns):
        cell.text = str(column)

    # itertuples() keeps each column's own dtype. iterrows() builds a Series
    # per row, which upcasts mixed numeric columns (an int 25 would be
    # written as "25.0" when another column holds floats).
    for record in df.itertuples(index=False, name=None):
        for cell, value in zip(table.add_row().cells, record):
            cell.text = str(value)

    doc.save(file_path)
# Example usage: export the sample DataFrame to output.docx.
write_dataframe_to_docx(df, 'output.docx')
In [ ]:
# Reading a .doc File into a DataFrame
import pandas as pd
import textract
def read_doc_to_dataframe(file_path):
    """Extract the text of a legacy .doc file into a one-column DataFrame.

    The bytes returned by ``textract.process`` are decoded as UTF-8 and split
    on ``'\\n'``; every resulting piece, including empty ones, becomes a row.

    Args:
        file_path: Location of the .doc file; passed unchanged to
            ``textract.process``.

    Returns:
        A DataFrame with a single ``'Text'`` column, one row per line of the
        extracted text.
    """
    text = textract.process(file_path).decode('utf-8')
    return pd.DataFrame({'Text': text.split('\n')})
# Example usage: load sample.doc and display the resulting DataFrame.
df = read_doc_to_dataframe('sample.doc')
print(df)
In [ ]:
# Writing a DataFrame to a .doc File
import pandas as pd
from docx import Document
import subprocess
def write_dataframe_to_docx(df, file_path):
    """Write a DataFrame to a .docx file as a single table.

    The table gets one header row holding the column labels followed by one
    row per DataFrame record. Every label and value is converted with
    ``str()`` before being stored in its cell. The DataFrame index is not
    written.

    Args:
        df: The DataFrame to export.
        file_path: Destination of the .docx file; passed unchanged to
            ``doc.save``.
    """
    doc = Document()
    table = doc.add_table(rows=1, cols=len(df.columns))

    # Header row. str() lets non-string column labels (ints, tuples, ...)
    # be written as cell text as well.
    for cell, column in zip(table.rows[0].cells, df.columns):
        cell.text = str(column)

    # itertuples() keeps each column's own dtype. iterrows() builds a Series
    # per row, which upcasts mixed numeric columns (an int 25 would be
    # written as "25.0" when another column holds floats).
    for record in df.itertuples(index=False, name=None):
        for cell, value in zip(table.add_row().cells, record):
            cell.text = str(value)

    doc.save(file_path)
def convert_docx_to_doc(docx_path, doc_path):
    """Convert a .docx file to the legacy .doc format using LibreOffice.

    LibreOffice is run headless and writes its result into a temporary
    directory; the converted file is then moved to ``doc_path``. Converting
    into a scratch directory means the result is found regardless of any
    directory component in ``docx_path``, and no unrelated file in the
    current working directory is touched.

    Args:
        docx_path: Path of the existing .docx file to convert.
        doc_path: Path where the converted .doc file should end up.

    Raises:
        subprocess.CalledProcessError: If the ``libreoffice`` command exits
            with a non-zero status.
        FileNotFoundError: If the ``libreoffice`` executable is missing, or
            if it exited successfully without producing the expected file.
    """
    import shutil
    import tempfile
    from pathlib import Path

    source = Path(docx_path)
    with tempfile.TemporaryDirectory() as tmp_dir:
        subprocess.run(
            [
                'libreoffice', '--headless', '--convert-to', 'doc',
                str(source), '--outdir', tmp_dir,
            ],
            check=True,
        )
        # The output keeps the source's base name with the new extension.
        # Only the final suffix is swapped, so a '.docx' occurring elsewhere
        # in the path is left alone.
        produced = Path(tmp_dir) / source.with_suffix('.doc').name
        if not produced.exists():
            raise FileNotFoundError(
                f'LibreOffice did not produce {produced.name} from {docx_path}'
            )
        # shutil.move instead of the external `mv` command: portable and
        # raises on failure instead of failing silently.
        shutil.move(str(produced), str(doc_path))
# Example usage: build a sample DataFrame, write it to output.docx, then
# convert that file to the legacy .doc format.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}
df = pd.DataFrame(data)
write_dataframe_to_docx(df, 'output.docx')
convert_docx_to_doc('output.docx', 'output.doc')
In [ ]:
# Importing multiple .docx documents into a DataFrame
import os
import pandas as pd
from docx import Document
def read_docx(file_path):
    """Return the text of a .docx file as a single string.

    Args:
        file_path: Location of the .docx file; passed unchanged to
            ``Document``.

    Returns:
        The text of every entry in ``doc.paragraphs``, joined with ``'\\n'``
        in document order. An empty string if there are no paragraphs.
    """
    doc = Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)
# Placeholder: replace with the directory that holds the .docx files.
folder_path = 'path/to/your/folder'

documents = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting DataFrame reproducible.
for filename in sorted(os.listdir(folder_path)):
    # Names starting with '~$' are lock files Word creates next to an open
    # document; they end in .docx but are not readable documents.
    if not filename.endswith('.docx') or filename.startswith('~$'):
        continue
    file_path = os.path.join(folder_path, filename)
    documents.append({'filename': filename, 'text': read_docx(file_path)})

# Explicit columns keep the schema stable even when no .docx file was found.
df = pd.DataFrame(documents, columns=['filename', 'text'])
print(df)