Improving Comments in codebase #72

Merged
merged 1 commit on Mar 11, 2024

Changes from all commits
80 changes: 64 additions & 16 deletions analyze/data_analysis.py
@@ -17,22 +17,27 @@
import seaborn as sns

warnings.filterwarnings("ignore")

+# Third-party
from wordcloud import STOPWORDS, WordCloud # noqa: E402

+# Set the current working directory
CWD = os.path.dirname(os.path.abspath(__file__))


def tags_frequency(csv_path, column_names):
-    # attribute csv_path is string
-    # attribute column_names is a list
-    # i.e. column_names = ["tags", "description"]
    """
-    This function is to generate a word cloud
-    based on all the tags of each license
-    each license one cloud
+    Generate a word cloud based on all the tags of each license.
+    Each license has its own cloud.
+
+    Args:
+    - csv_path (str): Path to the CSV file containing data.
+    - column_names (list): List of column names to process.
+      Example: ["tags", "description"]
+
    """
df = pd.read_csv(csv_path)
+    # Process each column containing tags
for column_name in column_names:
list2 = []
if column_name == "tags":
@@ -56,7 +61,7 @@ def tags_frequency(csv_path, column_names):
text = ""
stopwords = set(STOPWORDS)

-    # The stop words can be customized based on diff cases
+    # Customize stop words for the word cloud
flickr_customized = {
"nan",
"https",
@@ -103,7 +108,7 @@ def tags_frequency(csv_path, column_names):
# Join the lowercase words with a space separator
text = " ".join(lowercase_words)

-    # Creating the word cloud
+    # Creating WordCloud
tags_word_cloud = WordCloud(
width=800,
height=800,
@@ -112,7 +117,7 @@ def tags_frequency(csv_path, column_names):
min_font_size=10,
).generate(text)

-    # Plotting the word cloud
+    # Plotting the WordCloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(tags_word_cloud, interpolation="bilinear")
plt.axis("off")
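
To sanity-check the documented behavior, here is a minimal usage sketch of tags_frequency; it is hypothetical and not part of this PR. The path is one of the dataset files referenced later in this file, and the column names are the example the new docstring gives:

# Hypothetical usage sketch, not part of this PR:
tags_frequency(
    "../flickr/dataset/cleaned_license1.csv", ["tags", "description"]
)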
@@ -129,12 +134,21 @@ def tags_frequency(csv_path, column_names):


def time_trend_helper(df):
"""
Extract year-wise count of entries from a DataFrame.

Args:
df (DataFrame): Input DataFrame containing dates.

Returns:
DataFrame: DataFrame with counts of entries per year.
"""
year_list = []
for date_row in df["dates"][0:]:
date_list = str(date_row).split()
year_list.append(date_list[0])
df["Dates"] = year_list

# Count occurrences of each year
    # Use rename_axis to name the column from the index, then reset_index
count_df = (
df["Dates"]
@@ -143,11 +157,18 @@ def time_trend_helper(df):
.rename_axis("Dates")
.reset_index(name="Counts")
)
+    # Remove first and last rows
count_df = count_df.drop([0, len(count_df) - 1])
return count_df
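
A hedged sketch of what the helper consumes and returns, using an invented toy DataFrame (not part of this PR):

# Hypothetical usage sketch, not part of this PR:
import pandas as pd

toy = pd.DataFrame({"dates": ["2018 a", "2019 b", "2019 c", "2020 d"]})
counts = time_trend_helper(toy)
# counts has "Dates" and "Counts" columns; note the helper also drops
# the first and last rows of the tally.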


def time_trend(csv_path):
"""
Generate a line graph to show the time trend of the license usage.

Args:
csv_path (str): Path to the CSV file.
"""
df = pd.read_csv(csv_path)
count_df = time_trend_helper(df)
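
Usage is a single call; the path below is hypothetical, reusing one of the dataset files referenced elsewhere in this module:

# Hypothetical usage sketch, not part of this PR:
time_trend("../flickr/dataset/cleaned_license1.csv")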

@@ -181,9 +202,13 @@

def time_trend_compile_helper(yearly_count):
"""
yearly_count is the dataframe with "year" and "Counts" as two columns
This function will return counts - the list of "Counts" with the
condition that their corresponding "year" is between [2000, 2022]
Filter yearly trend data for the years between 2018 and 2022.

Args:
yearly_count (DataFrame): DataFrame with "year" and "Counts" columns.

Returns:
DataFrame: Filtered yearly count data.
"""
Years = np.arange(2018, 2023)
yearly_count["year"] = list(yearly_count.index)
@@ -201,6 +226,9 @@


def time_trend_compile():
"""
Compile yearly trends for different licenses and plot them.
"""
license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv")
license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv")
license3 = pd.read_csv("../flickr/dataset/cleaned_license3.csv")
@@ -209,6 +237,7 @@ def time_trend_compile():
license6 = pd.read_csv("../flickr/dataset/cleaned_license6.csv")
license9 = pd.read_csv("../flickr/dataset/cleaned_license9.csv")
license10 = pd.read_csv("../flickr/dataset/cleaned_license10.csv")
+    # Calculate yearly counts for each license
count_df1 = time_trend_helper(license1)
count_df2 = time_trend_helper(license2)
count_df3 = time_trend_helper(license3)
@@ -240,7 +269,6 @@ def time_trend_compile():
each_raw_data.dropna(how="all")
list_data.append(each_raw_data)

-    # We set years are from 2000 to 2022
yearly_count1 = list_data[0].to_frame()
yearly_count2 = list_data[1].to_frame()
yearly_count3 = list_data[2].to_frame()
@@ -249,6 +277,7 @@ def time_trend_compile():
yearly_count6 = list_data[5].to_frame()
yearly_count9 = list_data[6].to_frame()
yearly_count10 = list_data[7].to_frame()
+    # Filter yearly count data for the years between 2018 and 2022
yearly_count1 = time_trend_compile_helper(yearly_count1)
yearly_count2 = time_trend_compile_helper(yearly_count2)
yearly_count3 = time_trend_compile_helper(yearly_count3)
@@ -259,7 +288,7 @@ def time_trend_compile():
yearly_count10 = time_trend_compile_helper(yearly_count10)
print(yearly_count1)

-    # plot lines
+    # Plot yearly trend for all licenses
plt.plot(
yearly_count1["Years"],
yearly_count1["Yearly_counts"],
@@ -337,6 +366,15 @@ def time_trend_compile():


def view_compare_helper(df):
"""
Calculate maximum views of pictures under a license.

Args:
df (DataFrame): Input DataFrame.

Returns:
int: Maximum views.
"""
highest_view = int(max(df["views"]))
df = df.sort_values("views", ascending=False)
return highest_view
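
A quick toy check of the helper, with invented values (not part of this PR):

# Hypothetical usage sketch, not part of this PR:
import pandas as pd

toy = pd.DataFrame({"views": [3, 42, 7]})
print(view_compare_helper(toy))  # 42
# Note: the sorted DataFrame inside the helper is discarded; only the
# maximum is returned.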
@@ -345,6 +383,9 @@ def view_compare_helper(df):


def view_compare():
"""
Compare maximum views of pictures under different licenses.
"""
license1 = pd.read_csv(
os.path.join(CWD, "../flickr/dataset/cleaned_license1.csv")
)
@@ -379,10 +420,12 @@ def view_compare():
license9,
license10,
]
+    # Calculate maximum views for each license
maxs = []
for lic in licenses:
maxs.append(view_compare_helper(lic))
print(maxs)
+    # Create a DataFrame to store licenses and their maximum views
temp_data = pd.DataFrame()
temp_data["Licenses"] = [
"CC BY-NC-SA 2.0",
@@ -395,6 +438,7 @@ def view_compare():
"Public Domain Mark 1.0",
]
temp_data["views"] = maxs
+    # Plot bar graph
fig, ax = plt.subplots(figsize=(13, 10))
ax.grid(b=True, color="grey", linestyle="-.", linewidth=0.5, alpha=0.6)
sns.set_style("dark")
@@ -433,7 +477,10 @@ def view_compare():


def total_usage():
-    # this will use the license total file as input dataset
+    """
+    Generate a bar plot showing the total usage of different licenses.
+    """
+    # Reads the license total file as the input dataset
df = pd.read_csv(os.path.join(CWD, "../flickr/dataset/license_total.csv"))
df["License"] = [str(x) for x in list(df["License"])]
fig = px.bar(df, x="License", y="Total amount", color="License")
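
For reviewers unfamiliar with Plotly Express, a self-contained toy version of the same call, with invented data (not part of this PR):

# Hypothetical usage sketch, not part of this PR:
import pandas as pd
import plotly.express as px

toy = pd.DataFrame(
    {"License": ["CC BY 2.0", "CC0 1.0"], "Total amount": [120, 80]}
)
fig = px.bar(toy, x="License", y="Total amount", color="License")
fig.show()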
@@ -448,6 +495,7 @@ def main():


if __name__ == "__main__":
+    # Exception handling
try:
main()
except SystemExit as e: