In [1]:
# Identifying and Removing Duplicate Rows
import pandas as pd

# Sample data: rows 3 and 4 repeat rows 0 and 1 value-for-value.
data = {
    "name": ["Alice", "Bob", "Charlie", "Alice", "Bob"],
    "age": [25, 30, 35, 25, 30],
    "city": ["New York", "Los Angeles", "Chicago", "New York", "Los Angeles"],
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Boolean Series, one entry per row: True where the row repeats an earlier one.
duplicates = df.duplicated()
print("\nDuplicate Rows:")
print(duplicates)

# Drop the repeated rows; the first occurrence of each row is kept.
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after removing duplicates:")
print(df_no_duplicates)
Original DataFrame:
      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    Alice   25     New York
4      Bob   30  Los Angeles

Duplicate Rows:
0    False
1    False
2    False
3     True
4     True
dtype: bool

DataFrame after removing duplicates:
      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
In [2]:
# Removing Duplicates Based on Specific Columns
import pandas as pd

# Sample data: "Alice" and "Bob" each appear twice in the "name" column.
data = {
    "name": ["Alice", "Bob", "Charlie", "Alice", "Bob"],
    "age": [25, 30, 35, 25, 30],
    "city": ["New York", "Los Angeles", "Chicago", "New York", "Los Angeles"],
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Only the "name" column is compared when deciding whether a row is a
# duplicate; the first row for each distinct name is kept.
df_no_duplicates_name = df.drop_duplicates(subset=["name"])
print("\nDataFrame after removing duplicates based on 'name':")
print(df_no_duplicates_name)
Original DataFrame:
      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    Alice   25     New York
4      Bob   30  Los Angeles

DataFrame after removing duplicates based on 'name':
      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
In [3]:
# Keeping the Last Occurrence of Duplicate Rows
import pandas as pd

# Sample data: rows 3 and 4 repeat rows 0 and 1 value-for-value.
data = {
    "name": ["Alice", "Bob", "Charlie", "Alice", "Bob"],
    "age": [25, 30, 35, 25, 30],
    "city": ["New York", "Los Angeles", "Chicago", "New York", "Los Angeles"],
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# keep="last" retains the final occurrence of each repeated row, so the
# surviving rows carry the later index labels (2, 3, 4).
df_no_duplicates_last = df.drop_duplicates(keep="last")
print("\nDataFrame after removing duplicates, keeping the last occurrence:")
print(df_no_duplicates_last)
Original DataFrame:
      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    Alice   25     New York
4      Bob   30  Los Angeles

DataFrame after removing duplicates, keeping the last occurrence:
      name  age         city
2  Charlie   35      Chicago
3    Alice   25     New York
4      Bob   30  Los Angeles
In [4]:
# Counting Duplicate Rows
import pandas as pd

# Sample data: rows 3 and 4 repeat rows 0 and 1 value-for-value.
data = {
    "name": ["Alice", "Bob", "Charlie", "Alice", "Bob"],
    "age": [25, 30, 35, 25, 30],
    "city": ["New York", "Los Angeles", "Chicago", "New York", "Los Angeles"],
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# duplicated() yields a boolean Series; summing it counts the True entries,
# i.e. the rows that repeat an earlier row.
duplicate_count = df.duplicated().sum()
print("\nNumber of duplicate rows:")
print(duplicate_count)
Original DataFrame:
      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    Alice   25     New York
4      Bob   30  Los Angeles

Number of duplicate rows:
2