In [1]:
# Dependencies: uncomment the next line to install them into this kernel's environment.
# %pip install pandas pyarrow
In [2]:
# Writing a DataFrame to a Parquet File
import pandas as pd

# Column-oriented sample records: one key per column, one list entry per row.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}
df = pd.DataFrame(data)

# index=False keeps the DataFrame index out of the file, so only the
# Name/Age/City columns are stored.
df.to_parquet('sample1.parquet', engine='pyarrow', index=False)
In [3]:
# Reading a Parquet File into a DataFrame
import pandas as pd

# Load the single-file dataset back into memory using the pyarrow engine.
df = pd.read_parquet('sample1.parquet', engine='pyarrow')
print(df)
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
In [4]:
# Writing a DataFrame to a Partitioned Parquet File
import pandas as pd

# Column-oriented sample records: one key per column, one list entry per row.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'London'],
}
df = pd.DataFrame(data)

# With partition_cols the target path is a directory, not a single file:
# pyarrow writes one sub-directory per distinct City value.
# existing_data_behavior is forwarded to pyarrow; 'delete_matching' clears any
# partition directory that is about to be written, so re-running this cell
# replaces the previous files instead of adding to them.
df.to_parquet(
    'sample2.parquet',
    engine='pyarrow',
    partition_cols=['City'],
    index=False,
    existing_data_behavior='delete_matching',
)
In [5]:
# Reading a Partitioned Parquet File
import pandas as pd

# Pointing read_parquet at the dataset directory loads every partition into
# one DataFrame; the printed result includes the City partition column.
# NOTE(review): rows appear to come back grouped by partition rather than in
# the order they were written -- confirm before relying on row order.
df = pd.read_parquet('sample2.parquet', engine='pyarrow')
print(df)
      Name  Age         City
0  Charlie   35      Chicago
1    David   40       London
2      Bob   30  Los Angeles
3    Alice   25     New York
In [6]:
# Reading Multiple Parquet Files into a Single DataFrame
import pandas as pd
import glob

# To read from another location, use a pattern such as:
#parquet_files = glob.glob('path/to/parquet/files/*.parquet')

# glob.glob returns matches in arbitrary filesystem order; sorting makes the
# row order of the combined frame reproducible across runs and machines.
parquet_files = sorted(glob.glob('*.parquet'))

# Each match is loaded on its own and the frames are stacked. Every frame keeps
# its own index, so labels repeat in the result; pass ignore_index=True to
# pd.concat if a fresh 0..n-1 index is wanted.
df = pd.concat([pd.read_parquet(file, engine='pyarrow') for file in parquet_files])
print(df)
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
0  Charlie   35      Chicago
1    David   40       London
2      Bob   30  Los Angeles
3    Alice   25     New York