Add Entity Bridge app page.
Cybonto committed Dec 2, 2024
1 parent bea4ee1 commit ce3a153
Showing 12 changed files with 131 additions and 10 deletions.
2 changes: 1 addition & 1 deletion streamlit_app/app/.gitignore
@@ -1,3 +1,3 @@
 *.json
-!*_promptvariables.json
+entity_bridge_promptvariables.json
 my*.json
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/deduplication_ui.py
@@ -1,4 +1,4 @@
-# components/deduplication_ui.py
+# data_unificator/components/deduplication_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/import_ui.py
@@ -1,4 +1,4 @@
-# components/import_ui.py
+# data_unificator/components/import_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/mapping_ui.py
@@ -1,4 +1,4 @@
-# components/mapping_ui.py
+# data_unificator/components/mapping_ui.py

 import os
 import streamlit as st
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/normalization_ui.py
@@ -1,4 +1,4 @@
-# components/normalization_ui.py
+# data_unificator/components/normalization_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/validation_ui.py
@@ -1,4 +1,4 @@
-# components/validation_ui.py
+# data_unificator/components/validation_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/config.py
@@ -1,4 +1,4 @@
-# config.py
+# data_unificator/config.py

 import os
 import yaml
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/data_normalizer.py
@@ -8,7 +8,7 @@

 import pandas as pd
 import streamlit as st
-from utils import (
+from entity_bridge.utils import (
     generate_unique_identifier,
     normalize_text,
     log_normalization_actions
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/duplicate_remover.py
@@ -8,7 +8,7 @@

 import pandas as pd
 import streamlit as st
-from utils import log_normalization_actions
+from entity_bridge.utils import log_normalization_actions

 def identify_duplicates(df, selected_fields):
     """
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/entity_matcher.py
@@ -9,7 +9,7 @@

 import pandas as pd
 import streamlit as st
-from utils import calculate_similarity, generate_unique_identifier
+from entity_bridge.utils import calculate_similarity, generate_unique_identifier
 from collections import defaultdict

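The three hunks above (data_normalizer.py, duplicate_remover.py, entity_matcher.py) switch from a bare "from utils import ..." to the package-qualified "from entity_bridge.utils import ...". A minimal sketch of the layout this assumes, with names taken from the paths and imports in this diff (any __init__.py files are omitted):

streamlit_app/app/
    pages/
        Entity_Bridge.py    (new in this commit; imports entity_bridge.*)
    entity_bridge/
        utils.py
        data_loader.py
        data_normalizer.py
        duplicate_remover.py
        entity_matcher.py
        ui_helper.py

With this layout, entity_bridge resolves as a package from the new page regardless of which script is the entrypoint, whereas the bare utils import only worked when entity_bridge/ itself was on the import path.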
2 changes: 2 additions & 0 deletions streamlit_app/app/entity_bridge/ui_helper.py
@@ -29,6 +29,8 @@ def display_file_upload():

     if not uploaded_files:
         st.warning("Please upload at least two files to proceed.")
+    else:
+        st.info(f"{len(uploaded_files)} files were uploaded.")
     return uploaded_files

 def display_missing_data_options(idx, file_name):
119 changes: 119 additions & 0 deletions streamlit_app/app/pages/Entity_Bridge.py
@@ -0,0 +1,119 @@
# Entity_Bridge.py

app_version = "0.1"
app_title = "OllaLab - Entity Bridge"
app_description = "Seamlessly merge multiple datasets based on entity names"
app_icon = ":link:"

import streamlit as st
from entity_bridge import data_loader
from entity_bridge import data_normalizer
from entity_bridge import duplicate_remover
from entity_bridge import entity_matcher
from entity_bridge import ui_helper

def process_file(file, idx):
    """
    Process a single uploaded file, including loading, handling missing data,
    field selection, and initial data preparation.
    Args:
        file (UploadedFile): The file uploaded by the user.
        idx (int): Index of the file in the list of uploaded files.
    Returns:
        tuple: A tuple (DataFrame, selected_fields), where DataFrame is the
        processed DataFrame, and selected_fields is a dictionary of selected fields.
    Side Effects:
        Displays messages and widgets in the Streamlit UI.
    """
    st.header(f"Processing file: {file.name}")

    try:
        # Load the data
        df = data_loader.load_data(file)
        st.success(f"Successfully loaded {file.name}")

        # Display the first few rows of the data
        st.subheader("Data Preview")
        st.dataframe(df.head())

        # Handle missing data
        strategy, default_value, missing_threshold = ui_helper.display_missing_data_options(idx, file.name)

        if strategy == 'remove':
            df = data_loader.handle_missing_data(df, 'remove')
        elif strategy == 'fill':
            df = data_loader.handle_missing_data(df, 'fill', default_value=default_value)
        elif strategy == 'skip':
            df = data_loader.handle_missing_data(df, 'skip', missing_threshold=missing_threshold)
        else:
            st.error("Invalid strategy selected for handling missing data.")
            return None, None

        # Field selection
        selected_fields = ui_helper.display_field_selection(df, file.name, idx)

        if not selected_fields.get('parent_name'):
            st.error("Parent Name Field is mandatory. Cannot proceed without it.")
            return None, None

        # Ensure required columns are in the DataFrame
        required_columns = [field for field in selected_fields.values() if field]
        df_selected = df[required_columns].copy()

        st.success(f"File {file.name} processed successfully.")
        return df_selected, selected_fields

    except Exception as e:
        st.error(f"An error occurred while processing {file.name}: {e}")
        return None, None

st.title("Entity Bridge")
st.write("Merge multiple datasets containing entity information with overlapping entities.")

# Step 1: File Upload
uploaded_files = ui_helper.display_file_upload()

data_frames = []

if uploaded_files and len(uploaded_files) >= 2:
    # Step 2: Load and preprocess the data files
    for idx, file in enumerate(uploaded_files):
        df_selected, selected_fields = process_file(file, idx)
        if df_selected is not None and selected_fields:
            data_frames.append((df_selected, selected_fields))
        else:
            st.error(f"Failed to process file {file.name}.")
else:
    st.warning("Please upload at least two files to proceed.")

if data_frames:
    st.header("Normalizing Data and Removing Duplicates")
    # Step 3: Normalize IDs and Names
    normalized_data_frames = data_normalizer.normalize_data_frames(data_frames)

    # Step 4: Remove Duplicates
    deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)

    st.header("Matching Entities and Assigning Unique Identifiers")
    # Step 5: Construct Unique Parent List
    unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)

    # Step 6: Construct Unique Child List
    unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)

    # Step 7: Enrich DataFrames with Unique IDs
    enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
        deduplicated_data_frames, unique_parents_df, unique_children_df
    )

    # Step 8: Display Enriched DataFrames
    ui_helper.display_enriched_data(enriched_data_frames)

    # Step 9: Download Enriched DataFrames
    ui_helper.download_enriched_data(enriched_data_frames)

else:
    st.warning("Please upload at least two files to proceed.")
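Because Entity_Bridge.py sits in the app's pages/ directory, it should appear as a page in the Streamlit sidebar once the app's main entrypoint is started. A minimal way to try it, assuming Streamlit is installed and that the main script lives directly under streamlit_app/app (the entrypoint's filename is not shown in this commit, so main.py below is a placeholder):

cd streamlit_app/app
streamlit run main.py

Launching from streamlit_app/app keeps the entity_bridge package importable, which the import changes in this commit depend on.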
