In [ ]:
import pandas as pd
In [ ]:
# Manually Creating a DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35]
}
df = pd.DataFrame(data)
print(df)
In [ ]:
data = {
    'PatientID': [1, 2, 3],
    'Name': ['John Doe', 'Jane Smith', 'Emily Davis'],
    'Age': [45, 38, 50],
    'Diagnosis': ['Hypertension', 'Diabetes', 'Asthma']
}
df_manual = pd.DataFrame(data)
print(df_manual)
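A DataFrame can also be built from a list of row dictionaries, one dict per record, which is convenient when records arrive one at a time. A minimal sketch reusing the same illustrative patient data:
In [ ]:
# Each dictionary is one row; pandas aligns the keys into columns
records = [
    {'PatientID': 1, 'Name': 'John Doe', 'Age': 45, 'Diagnosis': 'Hypertension'},
    {'PatientID': 2, 'Name': 'Jane Smith', 'Age': 38, 'Diagnosis': 'Diabetes'},
    {'PatientID': 3, 'Name': 'Emily Davis', 'Age': 50, 'Diagnosis': 'Asthma'}
]
df_records = pd.DataFrame(records)
print(df_records)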
CSV¶
In [ ]:
# Assuming 'healthcare_data.csv' is your file
df_csv = pd.read_csv('healthcare_data.csv')
print(df_csv.head())
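read_csv accepts many optional parameters; a few commonly useful ones are sketched below. The column names are assumptions, so adjust them to your data.
In [ ]:
# A sketch of common read_csv options -- the column names here are assumptions
df_csv = pd.read_csv(
    'healthcare_data.csv',
    sep=',',                       # field delimiter (comma is the default)
    usecols=['PatientID', 'Age'],  # load only the columns you need
    dtype={'PatientID': 'int64'},  # force column types
    nrows=100                      # read just the first 100 rows while exploring
)
print(df_csv.head())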
Parquet¶
In [ ]:
# Assuming 'healthcare_data.parquet' is your file
df_parquet = pd.read_parquet('healthcare_data.parquet')
print(df_parquet.head())
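For wide Parquet files it is often faster to load only the columns you need; read_parquet supports column selection and an explicit engine. The column names below are assumptions.
In [ ]:
# Column selection is pushed down to the Parquet reader (column names are assumptions)
df_parquet = pd.read_parquet(
    'healthcare_data.parquet',
    columns=['PatientID', 'Diagnosis'],
    engine='pyarrow'  # requires the pyarrow package
)
print(df_parquet.head())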
XLS¶
In [ ]:
df_xls = pd.read_excel('healthcare_data.xls')
print(df_xls.head())
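Reading legacy .xls files typically requires the xlrd package, while .xlsx files use openpyxl; if the cell above raises an ImportError, installing these usually resolves it.
In [ ]:
!pip install xlrd openpyxl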
XLSX¶
In [ ]:
df_xlsx = pd.read_excel('healthcare_data.xlsx')
print(df_xlsx.head())
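read_excel loads the first sheet by default; sheet_name selects a specific sheet, and passing None returns a dict of DataFrames, one per sheet. The sheet name below is hypothetical.
In [ ]:
# 'Patients' is a hypothetical sheet name -- replace it with one from your workbook
df_sheet = pd.read_excel('healthcare_data.xlsx', sheet_name='Patients')
# sheet_name=None returns {sheet name: DataFrame} for every sheet in the file
all_sheets = pd.read_excel('healthcare_data.xlsx', sheet_name=None)
print(list(all_sheets.keys()))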
GZ¶
In [ ]:
import pandas as pd
import gzip
with gzip.open('healthcare_data.csv.gz', 'rt') as f:
    df_gz = pd.read_csv(f)
print(df_gz.head())
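pandas can also decompress gzip on its own, so the gzip module is not strictly needed; compression is inferred from the .gz extension or can be set explicitly.
In [ ]:
# compression='infer' (the default) would detect gzip from the .gz extension
df_gz = pd.read_csv('healthcare_data.csv.gz', compression='gzip')
print(df_gz.head())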
ZIP¶
In [ ]:
# zipfile is part of the Python standard library, so no extra installation is needed
In [ ]:
import pandas as pd
import zipfile
with zipfile.ZipFile('healthcare_data.zip') as z:
    with z.open('healthcare_data.csv') as f:
        df_zip = pd.read_csv(f)
print(df_zip.head())
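If the archive contains exactly one CSV, read_csv can open it directly via compression='zip' (also inferred from the .zip extension), with no explicit zipfile handling.
In [ ]:
# Works only when the ZIP archive holds a single data file
df_zip = pd.read_csv('healthcare_data.zip', compression='zip')
print(df_zip.head())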
In [ ]:
import pandas as pd
import zipfile
import io
# Path to your ZIP file
zip_file_path = 'path/to/your/file.zip'
# Create a ZipFile object
with zipfile.ZipFile(zip_file_path, 'r') as z:
    # List all files in the ZIP
    file_list = z.namelist()
    # Read each CSV file into a DataFrame and concatenate them
    df_list = []
    for file_name in file_list:
        if file_name.endswith('.csv'):
            with z.open(file_name) as f:
                df = pd.read_csv(f)
                df_list.append(df)
# Concatenate all DataFrames
combined_df = pd.concat(df_list, ignore_index=True)
print(combined_df.head())
TAR¶
In [ ]:
import pandas as pd
import tarfile
with tarfile.open('healthcare_data.tar.gz', 'r:gz') as tar:
    for member in tar.getmembers():
        if member.isfile():
            f = tar.extractfile(member)
            df_tar = pd.read_csv(f)
            print(df_tar.head())
In [ ]:
import pandas as pd
import tarfile
# Path to your TAR file
tar_file_path = 'path/to/your/file.tar'
# Create a TarFile object
with tarfile.open(tar_file_path, 'r') as t:
    # List all files in the TAR
    file_list = t.getnames()
    # Read each CSV file into a DataFrame and concatenate them
    df_list = []
    for file_name in file_list:
        if file_name.endswith('.csv'):
            f = t.extractfile(file_name)
            df = pd.read_csv(f)
            df_list.append(df)
# Concatenate all DataFrames
combined_df = pd.concat(df_list, ignore_index=True)
print(combined_df.head())
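On pandas 1.5 or newer, read_csv can reportedly read from TAR archives directly through the compression argument, which avoids the manual tarfile handling when the archive holds a single CSV; treat this as a version-dependent shortcut.
In [ ]:
# Requires pandas >= 1.5 and an archive containing a single CSV member
df_tar = pd.read_csv('healthcare_data.tar.gz', compression='tar')
print(df_tar.head())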
Reading multiple CSV files in a directory¶
In [ ]:
import pandas as pd
import glob
# Specify the path to the directory containing the CSV files
path = 'path/to/your/csv/files/'
# Use glob to get all the CSV files in the directory
all_files = glob.glob(path + "*.csv")
# Create an empty list to hold the DataFrames
dfs = []
# Loop through the list of files and read each one into a DataFrame
for file in all_files:
    df = pd.read_csv(file)
    dfs.append(df)
# Concatenate all the DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)
# Display the combined DataFrame
print(combined_df)
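When combining files it can help to record which file each row came from; one way is to add a column per file before concatenating. The sketch below uses pathlib instead of glob purely as an alternative, with the same placeholder directory.
In [ ]:
from pathlib import Path
# Read every CSV in the directory and tag each row with its source file name
csv_dir = Path('path/to/your/csv/files/')
dfs = [
    pd.read_csv(csv_file).assign(source_file=csv_file.name)
    for csv_file in csv_dir.glob('*.csv')
]
combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df.head())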
Google Cloud Storage¶
In [ ]:
!pip install google-cloud-storage
In [ ]:
import pandas as pd
from io import StringIO
from google.cloud import storage
# Replace with your actual credentials and bucket/file details
credentials_json = {
    "type": "service_account",
    "project_id": "your-project-id",
    "private_key_id": "your-private-key-id",
    "private_key": "your-private-key",
    "client_email": "your-client-email",
    "client_id": "your-client-id",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "your-client-x509-cert-url"
}
# Authenticate
client = storage.Client.from_service_account_info(credentials_json)
bucket = client.get_bucket('your-bucket-name')
blob = bucket.blob('your-file.csv')
# Download the file as a string
data = blob.download_as_string().decode('utf-8')
# Create DataFrame
df = pd.read_csv(StringIO(data))
print(df.head())
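If the gcsfs package is installed, pandas can also read gs:// paths directly and pass credentials through storage_options; the token value shown is a gcsfs convention and the key-file path is a placeholder.
In [ ]:
!pip install gcsfs
In [ ]:
# storage_options is forwarded to gcsfs; 'token' may be a service-account JSON path or a credentials dict
df = pd.read_csv(
    'gs://your-bucket-name/your-file.csv',
    storage_options={'token': 'path/to/service-account.json'}
)
print(df.head())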
In [ ]:
from google.cloud import storage
from io import StringIO
import pandas as pd
# Replace with your actual credentials and bucket details
credentials_json = {
    "type": "service_account",
    "project_id": "your-project-id",
    "private_key_id": "your-private-key-id",
    "private_key": "your-private-key",
    "client_email": "your-client-email",
    "client_id": "your-client-id",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "your-client-x509-cert-url"
}
client = storage.Client.from_service_account_info(credentials_json)
bucket = client.get_bucket('your-bucket-name')
# List all files in the bucket
blobs = bucket.list_blobs(prefix='your-folder/')
# Read multiple CSV files into DataFrames
dfs = []
for blob in blobs:
    if blob.name.endswith('.csv'):
        data = blob.download_as_string().decode('utf-8')
        df = pd.read_csv(StringIO(data))
        dfs.append(df)
# Combine all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df.head())
AWS S3¶
In [ ]:
!pip install boto3
In [ ]:
import pandas as pd
import boto3
from io import StringIO
# Replace with your actual credentials and bucket/file details
aws_access_key_id = 'YOUR_ACCESS_KEY'
aws_secret_access_key = 'YOUR_SECRET_KEY'
bucket_name = 'your-bucket-name'
file_key = 'your-file.csv'
# Authenticate and fetch the file from S3
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
obj = s3.get_object(Bucket=bucket_name, Key=file_key)
data = obj['Body'].read().decode('utf-8')
# Create DataFrame
df = pd.read_csv(StringIO(data))
print(df.head())
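With the s3fs package installed, read_csv also accepts s3:// URLs, and credentials can be passed via storage_options (the 'key'/'secret' names are s3fs's parameter names).
In [ ]:
!pip install s3fs
In [ ]:
# storage_options is forwarded to s3fs; 'key' and 'secret' are its credential parameters
df = pd.read_csv(
    's3://your-bucket-name/your-file.csv',
    storage_options={'key': 'YOUR_ACCESS_KEY', 'secret': 'YOUR_SECRET_KEY'}
)
print(df.head())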
In [ ]:
import boto3
import pandas as pd
from io import StringIO
# Replace with your actual credentials and bucket details
aws_access_key_id = 'YOUR_ACCESS_KEY'
aws_secret_access_key = 'YOUR_SECRET_KEY'
bucket_name = 'your-bucket-name'
folder_prefix = 'your-folder/'
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
# List all files in the bucket
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
files = [content['Key'] for content in response.get('Contents', []) if content['Key'].endswith('.csv')]
# Read multiple CSV files into DataFrames
dfs = []
for file_key in files:
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    data = obj['Body'].read().decode('utf-8')
    df = pd.read_csv(StringIO(data))
    dfs.append(df)
# Combine all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)