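# Setup assumed by the Polars snippets below; the sample frame is a hypothetical sketch
# (column names "a", "b", "c", "d" chosen to match the expressions used here)
import polars as pl
import polars.selectors as cs

df = pl.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [0.5, 4.0, 10.0],
        "c": [True, True, False],
        "d": ["x", "y", "x"],
    }
)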
# Get a mapping of column names to their data type
df.schema
# Get the dataframe's shape
df.shape
# Get list of column names
df.columns
# Get the first x rows of a dataset
df.head(3)
# Count number of rows
df.height
# Count number of columns
df.width
# Get summary statistics for the dataset
df.describe()
# Get a boolean mask of duplicated rows (chain .sum() to count them)
df.is_duplicated()
# Count unique combinations of a subset of columns
df.select(pl.struct("a", "b").n_unique())
# Count null values per column
df.null_count()
# Count non-null values in each column
df.count()
# Count unique values in each group
df.group_by("d", maintain_order=True).n_unique()
Update dataset
# Add columns to dataframe by passing a list of expressions
df.with_columns(
    [
        (pl.col("a") ** 2).alias("a^2"),
        (pl.col("b") / 2).alias("b/2"),
        (pl.col("c").not_()).alias("not c"),
    ]
)
# Sort by multiple columns
df.sort("c", "a", descending=[False, True])
# Set column names
df.columns = ["apple", "banana", "orange"]
# Fill floating point NaN value with a fill value
df.with_columns(pl.col("b").fill_nan(0))
# Filter the expression based on one or more predicate expressions.
# The original order of the remaining elements is preserved.
# Elements where the filter does not evaluate to True are discarded, including nulls.
df.group_by("group_col").agg(
lt=pl.col("b").filter(pl.col("b") < 2).sum(),
gte=pl.col("b").filter(pl.col("b") >= 2).sum(),
).sort("group_col")
# Replace multiple values by passing sequences to the old and new parameters
df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200]))
# Transpose data
df.transpose(include_header=True)
# Remove columns from df
df.drop(["bar", "ham"])
# Drop rows with null values in a subset of columns, given by name or with a selector
df.drop_nulls(subset=cs.integer())
# Rename columns
df.rename({"foo": "apple"})
# Add column showing % change between rows
df.with_columns(pl.col("a").pct_change().alias("pct_change"))
# Sum multiple columns
df.select(pl.sum("a", "c"))
# Sum all values horizontally across columns
df.with_columns(sum=pl.sum_horizontal("a", "b"))
# Count the number of unique values in one or more columns
df.select(pl.n_unique("b", "c"))
# Generate an index column by using len in conjunction with int_range()
df.select(
    pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"),
    pl.all(),
)
# Get the maximum value horizontally across columns
df.with_columns(max=pl.max_horizontal("a", "b"))
Categorical Filters and queries
# Replace all matching regex/literal substrings with a new string value
df.with_columns(pl.col("text").str.replace_all("a", "-"))
# apply case-insensitive string replacement
df.with_columns(
    pl.col("weather").str.replace_all(
        r"(?i)foggy|rainy|cloudy|snowy", "Sunny"
    )
)
# Split strings on a delimiter into a list column
df.with_columns(pl.col("s").str.split(by="_").alias("split"))
# Remove leading and trailing characters
df.with_columns(foo_stripped=pl.col("foo").str.strip_chars())
# Count characters in all strings in a column
df.with_columns(
pl.col("a").str.len_chars().alias("n_chars"))
# Check if a string contains a literal substring (literal=True treats the pattern as plain text, not regex)
df.select(pl.col("txt").str.contains("rab$", literal=True).alias("literal"))
# Determine whether any of the given patterns matches
df = pl.DataFrame(
    {
        "lyrics": [
            "Everybody wants to rule the world",
            "Tell me what you want, what you really really want",
            "Can you feel the love tonight",
        ]
    }
)
df.with_columns(
    pl.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
)
# Return the index position of the first substring matching a pattern.
df.select(
    pl.col("txt"),
    pl.col("txt").str.find("a|e").alias("a|e (regex)"),
    pl.col("txt").str.find("e", literal=True).alias("e (lit)"),
)
Numerical Filters and queries
# Iterate over the groups of the group by operation
for name, data in df.group_by(["foo"]):
    print(name)
    print(data)
# Compute aggregations for each group of a group by operation
df.group_by("a").agg(pl.col("b"), pl.col("c"))
# Compute multiple aggregates at once by passing a list of expressions
df.group_by("a").agg([pl.sum("b"), pl.mean("c")])
# Compute the sum of a column for each group
df.group_by("a").agg(pl.col("b").sum())
# Filter on multiple conditions
df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a"))
# Create a spreadsheet-style pivot table as a DataFrame
df.pivot(index="foo", columns="bar", values="baz", aggregate_function="sum")
# Pivot using selectors to determine the index/values/columns
df.pivot(
    index=cs.string(),
    columns=cs.string(),
    values=cs.numeric(),
    aggregate_function="sum",
    sort_columns=True,
).sort(
    by=cs.string(),
)
# Select columns from df
df.select(["foo", "bar"])
# Start a when-then-otherwise expression
df.with_columns(pl.when(pl.col("foo") > 2).then(1).otherwise(-1).alias("val"))
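# Sketch: several conditions can be chained with repeated when/then before the final otherwise (hypothetical thresholds and labels)
df.with_columns(
    pl.when(pl.col("foo") > 2)
    .then(pl.lit("high"))
    .when(pl.col("foo") > 1)
    .then(pl.lit("mid"))
    .otherwise(pl.lit("low"))
    .alias("bucket")
)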
# Window function for grouping by single column
df.with_columns(
pl.col("c").max().over("a").name.suffix("_max"),
)
# Window function for grouping by multiple columns by passing a list of column names or expressions
df.with_columns(
pl.col("c").min().over(["a", "b"]).name.suffix("_min"),
)
Numpy
# Return pairwise Pearson product-moment correlation coefficients between columns. Requires numpy to be installed.
df.corr()
Pandas
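# Standard imports assumed by the pandas snippets below
import numpy as np
import pandas as pd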
EDA
# Get info on dataset
df.info()
# List all df columns
df.columns
# Show first few rows of dataset
df.head()
# Show last few rows of dataset
df.tail()
# Number of rows in df
len(df)
# Boolean mask of null values (chain .sum() to count per column)
df.isnull()
# Boolean mask of NA values (isna is equivalent to isnull)
df.isna()
# Boolean mask of values that are not NA
df.notna()
# assert that there are no missing values in the dataframe
assert pd.notnull(df).all().all()
# assert all values are greater than 0
assert (df >=0).all().all()
Filters and queries
# Show specific rows and columns
df.iloc[2:10, 5:10]
# Show specific row numbers and column names
df.loc[[3, 10, 14, 23], ["nationality", "weight_kg", "height_cm"]]
# Filter rows in df
df.query("shooting > passing")
# Show all dtypes of df columns
df.dtypes
# Find rows where string starts with
df[df.first_name.str.startswith("J")]
# select the variables or columns of a certain data type
df.select_dtypes(include="int64")
# Keep values where a condition holds; everything else becomes NaN
df["random_col"].where(df["random_col"] > 50)
# used to find out the unique values of a categorical column
df.skill_moves.unique()
# remove duplicates from a list
unique_list = list(set(numbers))
# find most common element
from collections import Counter
most_common = Counter(numbers).most_common(1)
# make a subset of the dataset that will contain only a few nationalities of players using .isin() function
nationality = ["Argentina", "Portugal", "Sweden", "England"]
df[df.nationality.isin(nationality)]
# provides some basic statistical measures
df["wage_eur"].describe()
# Count non-NA values along an axis: axis=0 counts down each column,
# axis=1 counts across each row
df.count()
# Count non-NA values per group of an index level
# (count(level=...) is removed in newer pandas; group by the level instead)
df.groupby(level="Pclass").count()
Totals and grouping
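# Hypothetical sales frame assumed by the rolling examples below
data = pd.DataFrame(
    {
        "Date": pd.date_range("2023-01-01", periods=6, freq="D"),
        "Payment_type": ["cash", "card", "cash", "card", "cash", "card"],
        "Sales": [10, 20, 30, 40, 50, 60],
    }
)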
# Rolling Methods
## rolling sum
data["rolling_sum_2"] = data.rolling(window = 2).Sales.sum()
## rolling sum over 2 days
data["rolling_sum_2days"] = data.rolling(window = '2d', on = "Date").Sales.sum()
## grouped rolling sum
data["rolling_sum_grouped"] = data.groupby("Payment_type").rolling(window = 2).\
Sales.sum().reset_index().set_index("level_1").\
sort_index()["Sales"]
## rolling mean aka moving average
data["Date"] = pd.to_datetime(data["Date"])
data["rolling_avg_3days"] = data.rolling(window = 6, min_periods=1).Sales.mean()
# provides you with the cumulative sum of all the values of the previous rows
df[["value_eur", "wage_eur"]].cumsum()
# Number of unique values in a column
df.nationality.nunique()
# Group the data by a certain variable and aggregate several columns
df.groupby("death")[
    [
        "creatinine_phosphokinase",
        "ejection_fraction",
        "platelets",
        "serum_creatinine",
        "serum_sodium",
        "time",
    ]
].agg(["mean", "median"])
# value counts of each category
df["Pclass"].value_counts(sort=True, ascending=True, normalize=True)
# Melt the data, keeping 'Date' and 'City' as id variables
melted_df_single_var = pd.melt(df, id_vars=['Date', 'City'], var_name='Variable', value_name='Value')
# Melt the data with multiple value_vars
melted_df = pd.melt(df, id_vars=['Date', 'City'], value_vars=['Temperature', 'Humidity'], var_name='Variable', value_name='Value')
# Stack - pivot the innermost column index to become the innermost row index
stacked_df = df.set_index(['Date', 'City']).stack()
# Unstack - pivot the innermost row index to become the innermost column index
unstacked_df = stacked_df.unstack()
# Pivot Table
pivot_table_df = df.pivot_table(index='Date', columns='City', values=['Temperature', 'Humidity'], aggfunc='mean')
# Wide to Long
long_df = pd.wide_to_long(df, stubnames=['Temperature', 'Humidity'], i=['Date', 'City'], j='Variable', sep='_').reset_index()
# gives you a frequency table that is a cross-tabulation of two variables
pd.crosstab(df["league_rank"], df["international_reputation"])
# Bin the data into quantile-based buckets (roughly equal numbers of rows per bucket)
pd.qcut(df["value_eur"], q=5)
# Split the data into a given number of equal-width bins and count values per bin
pd.cut(df["value_eur"], bins=5).value_counts()
# percentage change between the current and a prior element in a Series or DataFrame
data = {
    "date": pd.date_range(start="2022-01-01", end="2022-01-05"),
    "price": [100, 105, 98, 110, 120],
}
df = pd.DataFrame(data).set_index("date")
df["price"].pct_change()
# Calculate rolling mean with a 7-day window
rolling_mean = data['value'].rolling(window=7).mean()
# Compute exponential moving average (EMA)
ema = data['value'].ewm(span=10).mean()
# find intersection of sets
set1 = {1, 2, 3}
set2 = {3, 4, 5}
intersection = set1 & set2
Update dataset
# When we reset the index, the old index is added as a column, and a new sequential index is used:
df.reset_index(names=['classes', 'names'])
# slice list
subset = numbers[2:5]
# dict comprehension
squared_dict = {x: x**2 for x in range(5)}
# zip lists
names = ['Alice', 'Bob', 'Charlie']
ages = [25, 30, 35]
person_info = dict(zip(names, ages))
# Append dataframes with DataFrame.append (removed in pandas 2.0; prefer pd.concat below)
result = df1.append([df2, df3])
# Append dataframes using pd.concat
frames = [df1, df2, df3]
result = pd.concat(frames)
# Drop columns from df
df = df.drop(columns=["Unnamed: 0", "weak_foot", "real_face"])
# Replace occurrences of a value throughout the dataframe
df.replace(1.0, 1.1)
# rename columns
df.rename(columns={"weight_kg": "Weight (kg)", "height_cm": "Height (cm)"})
# Update segment of data
df.loc[(df['COUNCIL'] == 'LEEDS') & (df['POSTCODE'] == 'LS8'), ['CCG']] = 'LEEDS NORTH CCG'
# Replace the null values in a column with a value of your choice
df["pace"] = df["pace"].fillna(0)
# drop null values
df.dropna()
# Drop duplicates
people = people.drop_duplicates(subset="Name")
# Drop columns from dataframe
to_drop = ['Edition Statement', 'Corporate Author', 'Corporate Contributors',
           'Former owner', 'Engraver', 'Contributors', 'Issuance type',
           'Shelfmarks']
df.drop(to_drop, inplace=True, axis=1)
# Convert to DateTime format
pd.to_datetime(people["Graduation"])
# Change all strings to lower case in a column
people["Standing"] = people["Standing"].str.lower()
# Handling invalid values
df["height(cm)"] = pd.to_numeric(df["height(cm)"], errors='coerce')
# Split columns
df[['age','sex']] = df.age_sex.str.split("_", expand = True)
# Reorder column labels
df = df[['fname','lname','age','sex','section','height(cm)','weight(kg)','spend_A','spend_B','spend_C']]
# unpivots a DataFrame from wide format to long format
# massage a DataFrame into a format where one or more columns are identifier variables, while all other columns, considered measured variables, are unpivoted to the row axis, leaving just two non-identifier columns, variable and value.
pd.melt(df, id_vars=['A'], value_vars=['B'])
# Explode a list column so that there is a separate row for each item in the list
df_new = df.explode(column="data").reset_index(drop=True)
# Assign: create a new DataFrame with additional columns, assigning values based on existing columns or operations
df.assign(value_cat=np.where(df["Value"] > 20, "high", "low"))
df.assign(value_cat=np.where(df["Value"] > 20, "high", "low")).groupby(
"value_cat"
).mean()
# combine_first: choose values from the first Series and fill in any missing values with the corresponding values from the second Series
s1 = pd.Series([1, 2, np.nan, 4, np.nan, 6])
s2 = pd.Series([10, np.nan, 30, 40, np.nan, 60])
s1.combine_first(s2)
s3 = pd.Series([1, 2, 3, 4, 5, 6])
s1.combine_first(s2).combine_first(s3)
# Sort List based on another List
list1 = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
list2 = [0, 1, 1, 1, 2, 2, 0, 1, 1, 3, 4]
C = [x for _, x in sorted(zip(list2, list1), key=lambda pair: pair[0])]
print(C) # ['a', 'g', 'b', 'c', 'd', 'h', 'i', 'e', 'f', 'j', 'k']
# Reading lines from a file until an empty line is encountered using walrus
# allows for assignment and return of a value within an expression
with open('myfile.txt') as file:
    while (line := file.readline().rstrip()):
        print(line)
Numpy
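# Hypothetical arrays assumed by the NumPy snippets below
import numpy as np
a = np.array([3, 1, 2, 5, 4, 6])
reshaped = a.reshape(2, 3)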
# Find out the number of dimensions in an array
a.ndim
# generate equally spaced values within a given interval
np.arange(5, 11, 2)
# Create linearly spaced array
linear_spaced = np.linspace(0, 1, 5) # 5 values from 0 to 1
# Accessing Array Elements
element = a[2] # Retrieve the third element of array 'a'
row = reshaped[1, :] # Retrieve the second row of 'reshaped'
# Boolean indexing
filtered = a[a > 2] # Elements of 'a' greater than 2
# transpose a NumPy array
a = np.arange(12).reshape((6, 2))
a.transpose()
# np.sort returns a sorted copy (use a.sort() to sort in place)
np.sort(a)  ## sort along the last axis (within each row)
np.sort(a, axis=None)  ## sort the flattened array
np.sort(a, axis=0)  ## sort down each column
Regular expressions
# Replace text
import re

replaced_text = re.sub(r"string", "sentence", text)
print(replaced_text)
Misc
# capitalize text
capitalized_text = text.capitalize()
# generate a GUID
import uuid
guid = uuid.uuid4()
# check for leap year
def is_leap_year(year): return (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
# Convert F to C
celsius = (fahrenheit - 32) * 5/9
# Create progress bar
import time
from tqdm import tqdm

for i in tqdm(range(100)):
    time.sleep(0.1)
# Get current working directory
import os
cwd = os.getcwd()
# Chain custom functions
result = (
    df.pipe(subtract_federal_tax)
    .pipe(subtract_state_tax, rate=0.12)
    .pipe(
        (subtract_national_insurance, 'df'),
        rate=0.05,
        rate_increase=0.02,
    )
)
print(result)
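# Hypothetical helpers the pipe chain above assumes (sketch only; assumes a numeric "salary" column)
def subtract_federal_tax(df):
    return df.assign(salary=df["salary"] * (1 - 0.10))

def subtract_state_tax(df, rate):
    return df.assign(salary=df["salary"] * (1 - rate))

# pipe((func, 'df'), ...) passes the DataFrame in as the keyword argument named 'df'
def subtract_national_insurance(df, rate, rate_increase):
    return df.assign(salary=df["salary"] * (1 - (rate + rate_increase)))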
# Normalize numeric columns
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['numeric_column']] = scaler.fit_transform(df[['numeric_column']])
# pretty printing
import pprint
data = {'a': [1, 2, 3], 'b': [4, 5, 6]}
pprint.pprint(data)