Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat] Streamlit 앱에 실험 데이터 분석을 위한 기능 추가 #41

Merged
merged 12 commits into from
Nov 23, 2024
Merged
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
HF_TOKEN={your_hf_token}
STREAMLIT_DATA_PATH={streamlit_data_path}
STREAMLIT_DATA_PATH={streamlit_data_path}
STREAMLIT_EXPERIMENT_DATA_PATH={streamlit_experiment_data_path}
13 changes: 13 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "streamlit debug",
"type": "debugpy",
"request": "launch",
"module": "streamlit",
"args": ["run", "${file}"],
"justMyCode": true
}
]
}
29 changes: 15 additions & 14 deletions analysis_dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dotenv import load_dotenv
from streamlit_option_menu import option_menu

from streamlit_utils import access_data_by_index, display_data_summary, display_question_format, filter_data_by_column
from streamlit_utils import display_data_summary, display_data_tab

if __name__ == "__main__":

Expand All @@ -24,35 +24,36 @@
if selected == "Home":
st.title("📊 Data Analysis")
uploaded_file = st.sidebar.file_uploader("Upload a CSV file for analysis", type="csv")
tab1, tab2, tab3 = st.tabs(["📊 데이터 개요", "🔍 데이터 탐색", "📈 데이터 분포"])
experiment_file = st.sidebar.file_uploader("Upload a experiment result CSV file for analysis", type="csv")
tab1, tab2, tab3, tab4 = st.tabs(["📊 데이터 개요", "🔍 데이터 탐색", "📈 데이터 분포", "🔬 실험 데이터"])

if uploaded_file:
df = pd.read_csv(uploaded_file)
else:
# 첨부 파일이 없으면 기본적으로 train.csv에 대한 분석을 출력합니다.
# 첨부 파일이 없으면 기본적으로 설정한 학습 데이터에 대한 분석을 출력합니다.
# .env에서 STREAMLIT_DATA_PATH, STREAMLIT_EXPERIMENT_DATA_PATH에 각각 학습 데이터, 실험 데이터를 설정하세요.
df = pd.read_csv(os.getenv("STREAMLIT_DATA_PATH"))
if experiment_file:
exp_df = pd.read_csv(experiment_file)
else:
exp_df = pd.read_csv(os.getenv("STREAMLIT_EXPERIMENT_DATA_PATH"))

# 데이터 요약
with tab1:
display_data_summary(df)

# 개별 데이터 접근
with tab2:
st.subheader("전체 데이터 확인")
st.dataframe(df)

st.subheader("개별 데이터 확인")
access_method = st.radio("데이터 접근 방식 선택", ("Access by Index", "Filter by Column"))
if access_method == "Access by Index":
access_data_by_index(df)
elif access_method == "Filter by Column":
filter_data_by_column(df)

display_question_format(df)
display_data_tab(df, "tab2")

# 분포 확인
with tab3:
st.subheader("데이터 분포")
# TODO: Add distribution plotting logic

# 실험 데이터 확인
with tab4:
display_data_tab(exp_df, "tab4")

elif selected == "Compare":
st.title("🆚 Compare Datasets")
1 change: 1 addition & 0 deletions streamlit_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .data_util import *
from .key_manager import *
49 changes: 40 additions & 9 deletions streamlit_utils/data_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@ def display_data_summary(df: pd.DataFrame):


# 인덱스 접근 함수
def access_data_by_index(df: pd.DataFrame):
def access_data_by_index(df: pd.DataFrame, tab_name: str):
st.markdown("#### Access Data by Index")
index_input = st.number_input(
"Enter the index of the row to retrieve:",
min_value=0,
max_value=len(df) - 1,
step=1,
key="unique_key_1",
key="index_input_" + tab_name,
)
if st.button("Retrieve by Index"):
if st.button("Retrieve by Index", key="index_retriever_" + tab_name):
if 0 <= index_input < len(df):
row_data = df.iloc[int(index_input)]
st.write(f"Row at index {int(index_input)}:")
Expand All @@ -39,12 +39,12 @@ def access_data_by_index(df: pd.DataFrame):


# 칼럼 필터링 함수
def filter_data_by_column(df: pd.DataFrame):
def filter_data_by_column(df: pd.DataFrame, tab_name: str):
st.markdown("#### Filter Data by Column")
column = st.selectbox("Select a column to filter by:", df.columns)
search_value = st.text_input(f"Enter the value to search in '{column}':")
column = st.selectbox("Select a column to filter by:", df.columns, key="column_filter_" + tab_name)
search_value = st.text_input(f"Enter the value to search in '{column}':", key="column_search_value_" + tab_name)

if st.button("Search"):
if st.button("Search", key="search_button_" + tab_name):
filtered_df = df[df[column].astype(str).str.contains(search_value, na=False, case=False, regex=False)]
result_count = len(filtered_df)
st.write(f"Number of rows containing '{search_value}' in column '{column}': {result_count}")
Expand All @@ -55,7 +55,7 @@ def filter_data_by_column(df: pd.DataFrame):


# 수능 형식으로 데이터 출력해주는 함수
def display_question_format(df: pd.DataFrame):
def display_question_format(df: pd.DataFrame, tab_name: str):
st.subheader("문제 형태로 확인")
required_columns = {"paragraph", "question", "choices"}
if not required_columns.issubset(df.columns):
Expand All @@ -66,7 +66,7 @@ def display_question_format(df: pd.DataFrame):
min_value=0,
max_value=len(df) - 1,
step=1,
key="unique_key_2",
key="question_idx_" + tab_name,
)
row = df.iloc[question_idx]
paragraph = row["paragraph"]
Expand All @@ -85,6 +85,14 @@ def display_question_format(df: pd.DataFrame):
st.markdown(body="#### 🔍 <보기>")
st.write(row["question_plus"])

default_columns = [
"id",
"paragraph",
"question",
"question_plus",
"choices",
"answer",
] # 제공된 데이터셋의 기본 열 이름 정보
choices_list = eval(choices) if isinstance(choices, str) else choices
st.markdown("#### 📝 선택지")
for idx, choice in enumerate(choices_list, start=1):
Expand All @@ -101,3 +109,26 @@ def display_question_format(df: pd.DataFrame):
if "answer" in df.columns:
st.markdown("#### ✅ 정답")
st.write(row["answer"])

# 기본 열이 아닌 생성된 열일 경우 추가로 렌더링 하는 기능
for column in df.columns:
if column not in default_columns:
st.markdown(f"#### {column}")
st.write(row[column])


# Shared renderer for a data-inspection tab
def display_data_tab(df: pd.DataFrame, tab_name: str):
    """Render the full-table view and the per-row inspection widgets for ``df``.

    Every widget key created here is suffixed with ``tab_name`` and the same
    suffix is forwarded to the helper renderers, so this function can be
    called once per tab with a distinct ``tab_name``.

    Args:
        df: The dataframe to display.
        tab_name: Suffix appended to each widget key (must be a ``str``).
    """
    # Map each radio label to the helper that renders that access mode.
    # Dict insertion order is also the order the options are shown in.
    renderers = {
        "Access by Index": access_data_by_index,
        "Filter by Column": filter_data_by_column,
    }

    # Whole-table view.
    st.subheader("전체 데이터 확인")
    st.dataframe(df, key="dataframe_" + tab_name)

    # Row-level view: the user picks how rows are looked up.
    st.subheader("개별 데이터 확인")
    selected_mode = st.radio(
        "데이터 접근 방식 선택",
        tuple(renderers),
        key="access_method_" + tab_name,
    )
    render_selected = renderers.get(selected_mode)
    if render_selected is not None:
        render_selected(df, tab_name)

    # The question-format view is rendered regardless of the selected mode.
    display_question_format(df, tab_name)