forked from duckdb/duckdb-web
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
129 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import duckdb | ||
import time | ||
|
||
con = duckdb.connect() | ||
|
||
def generate_df(size): | ||
race_categories = ['Hobbit', 'Elf', 'Man','Mayar'] | ||
race = np.random.choice(race_categories, size) | ||
subrace = np.random.choice(race_categories, size) | ||
return pd.DataFrame({'race': pd.Categorical(race), | ||
'subrace': pd.Categorical(subrace), | ||
'race_string': race, | ||
'subrace_string': subrace,}) | ||
|
||
size = pow(10,7) #10,000,000 rows | ||
df = generate_df(size) | ||
|
||
def duck_group(df): | ||
start_time = time.monotonic() | ||
res = con.execute("select race, count(*) from df group by race").df() | ||
print ("Duck (Cat) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def duck_group_string(df): | ||
start_time = time.monotonic() | ||
res = con.execute("select race_string, count(*) from df group by race_string").df() | ||
print ("Duck (Str) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def pandas_group(df): | ||
start_time = time.monotonic() | ||
res = df.groupby(['race']).agg({'race': 'count'}) | ||
print ("Pandas (Cat) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def pandas_group_string(df): | ||
start_time = time.monotonic() | ||
res = df.groupby(['race_string']).agg({'race_string': 'count'}) | ||
print ("Pandas (String) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def duck_filter(df): | ||
start_time = time.monotonic() | ||
res = con.execute("select count(*) from df where race = 'Hobbit'").df() | ||
print ("Duck (Cat) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def duck_filter_string(df): | ||
start_time = time.monotonic() | ||
res = con.execute("select count(*) from df where race_string = 'Hobbit'").df() | ||
print ("Duck (Str) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def pandas_filter(df): | ||
start_time = time.monotonic() | ||
filtered_df = df[df.race == "Hobbit"] | ||
res = filtered_df.agg({'race': 'count'}) | ||
print ("Pandas (Cat) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def pandas_filter_string(df): | ||
start_time = time.monotonic() | ||
filtered_df = df[df.race_string == "Hobbit"] | ||
res = filtered_df.agg({'race_string': 'count'}) | ||
print ("Pandas (String) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
|
||
def duck_comparison(df): | ||
start_time = time.monotonic() | ||
res = con.execute("select count(*) from df where race = subrace").df() | ||
print ("Duck (Cat) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def duck_comparison_string(df): | ||
start_time = time.monotonic() | ||
res = con.execute("select count(*) from df where race_string = subrace_string").df() | ||
print ("Duck (Str) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def pandas_comparison(df): | ||
start_time = time.monotonic() | ||
filtered_df = df[df.race == df.subrace] | ||
res = filtered_df.agg({'race': 'count'}) | ||
print ("Pandas (Cat) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
def pandas_comparison_string(df): | ||
start_time = time.monotonic() | ||
filtered_df = df[df.race_string == df.subrace_string] | ||
res = filtered_df.agg({'race_string': 'count'}) | ||
print ("Pandas (Str) : " + str(time.monotonic() - start_time)) | ||
return res | ||
|
||
|
||
print ("--- Group ---") | ||
duck_group(df) | ||
duck_group_string(df) | ||
pandas_group(df) | ||
pandas_group_string(df) | ||
|
||
print ("--- Filter ---") | ||
duck_filter(df) | ||
duck_filter_string(df) | ||
pandas_filter(df) | ||
pandas_filter_string(df) | ||
|
||
print ("--- Enum - Enum Comparison ---") | ||
duck_comparison(df) | ||
duck_comparison_string(df) | ||
pandas_comparison(df) | ||
pandas_comparison_string(df) | ||
|
||
def storage(): | ||
race_categories = ['Hobbit', 'Elf', 'Man','Mayar'] | ||
race = np.random.choice(race_categories, size) | ||
categorical_race = pd.DataFrame({'race': pd.Categorical(race),}) | ||
string_race = pd.DataFrame({'race': race,}) | ||
categorical_race.to_pickle("./pandas_cat.pkl") | ||
string_race.to_pickle("./pandas_str.pkl") | ||
con = duckdb.connect('duck_cat.db') | ||
con.execute("CREATE TABLE T as select * from categorical_race") | ||
con = duckdb.connect('duck_str.db') | ||
con.execute("CREATE TABLE T as select * from string_race") | ||
|
||
storage() | ||
|