Python Pandas 2

Data Manipulation

Advanced techniques for manipulating and transforming data in pandas DataFrames.

import pandas as pd

# Sample employee table: one row per person, reused by the later examples
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 35, 28],
    'Salary': [50000, 60000, 70000, 55000],
    'Department': ['IT', 'HR', 'IT', 'Finance'],
})

# Sorting: sort_values returns a new frame ordered by Age (ascending by
# default) and leaves df itself untouched
df_sorted = df.sort_values(by='Age')
print(df_sorted)

# Filtering: build a boolean mask, then keep only the rows where it is True
salary_mask = df['Salary'] > 55000
high_salary = df.loc[salary_mask]
print(high_salary)

# Calculated column: 10% of each salary, stored on df in place, so every
# later use of df sees the extra Annual_Bonus column
df['Annual_Bonus'] = 0.1 * df['Salary']
print(df)

Grouping Data

Group data by categories and perform aggregate operations.

# Split the rows into one group per distinct Department value
dept_groups = df.groupby(by='Department')

# Single-column aggregates: each result has one entry per department
salary_by_dept = dept_groups['Salary']
age_by_dept = dept_groups['Age']
print(salary_by_dept.mean())   # average salary in each department
print(age_by_dept.max())       # highest age in each department
print(dept_groups.size())      # number of rows in each department

# Several aggregations in one call: the dict maps a column name to the list
# of functions to apply, and the result has two-level column labels
# (column, function)
agg_result = dept_groups.agg({
    'Salary': ['mean', 'max', 'min'],
    'Age': ['mean', 'count'],
})
print(agg_result)

Merging DataFrames

Combine multiple DataFrames using join operations (merge) and row-wise concatenation (concat).

# Second table keyed by Name: 'Eve' does not appear in df, and df's
# 'Charlie' and 'Diana' do not appear here
df2 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Eve'],
    'City': ['New York', 'London', 'Paris'],
})

# Inner join: keep only the names found in both tables
merged_inner = df.merge(df2, on='Name', how='inner')
print(merged_inner)

# Left join: keep every row of df; City is NaN where df2 has no match
merged_left = df.merge(df2, on='Name', how='left')
print(merged_left)

# One extra employee row to append below the existing ones
df3 = pd.DataFrame({
    'Name': ['Frank'],
    'Age': [32],
    'Salary': [65000],
    'Department': ['Marketing'],
})

# Concatenation stacks the rows of df and df3 and renumbers the index from 0.
# A column present in only one frame (such as the Annual_Bonus column added
# to df earlier) is filled with NaN for the other frame's rows.
frames = [df, df3]
combined = pd.concat(frames, ignore_index=True)
print(combined)

File Operations

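Read from and write to various file formats including CSV, Excel, and JSON. The read examples assume that data.csv, data.xlsx, and data.json already exist in the working directory, and Excel support requires an optional engine such as openpyxl to be installed.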

# --- Reading -----------------------------------------------------------
# Each reader returns a new DataFrame; the input file must already exist.

# CSV
df_csv = pd.read_csv(filepath_or_buffer='data.csv')

# Excel: load only the worksheet named 'Sheet1'
df_excel = pd.read_excel(io='data.xlsx', sheet_name='Sheet1')

# JSON
df_json = pd.read_json(path_or_buf='data.json')

# --- Writing -----------------------------------------------------------
# index=False leaves the row index out of the written file.

# CSV
df.to_csv(path_or_buf='output.csv', index=False)

# Excel: write to a worksheet named 'Data'
df.to_excel(excel_writer='output.xlsx', sheet_name='Data', index=False)

# JSON: orient='records' writes a list with one object per row
df.to_json(path_or_buf='output.json', orient='records')

Handling Missing Data

Techniques for dealing with missing or null values in datasets.

# Build a small frame in which every column has exactly one missing entry
import numpy as np

df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, 7, 8],
    'C': [9, 10, 11, np.nan],
})

# Locate the gaps: an element-wise True/False mask, then the number of
# missing entries per column
missing_mask = df_missing.isna()
print(missing_mask)
print(missing_mask.sum())

# Drop every row that contains at least one missing value
df_dropped = df_missing.dropna(how='any')
print(df_dropped)

# Fill the gaps instead of dropping rows
df_filled = df_missing.fillna(value=0)  # constant replacement
column_means = df_missing.mean()  # per-column mean; NaN entries are skipped
df_filled_mean = df_missing.fillna(value=column_means)  # each column gets its own mean
print(df_filled_mean)