Add Entity Bridge app page.
Cybonto committed Dec 2, 2024
1 parent bea4ee1 commit ce3a153
Showing 12 changed files with 131 additions and 10 deletions.
2 changes: 1 addition & 1 deletion streamlit_app/app/.gitignore
@@ -1,3 +1,3 @@
 *.json
-!*_promptvariables.json
+entity_bridge_promptvariables.json
 my*.json
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/deduplication_ui.py
@@ -1,4 +1,4 @@
-# components/deduplication_ui.py
+# data_unificator/components/deduplication_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/import_ui.py
@@ -1,4 +1,4 @@
-# components/import_ui.py
+# data_unificator/components/import_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/mapping_ui.py
@@ -1,4 +1,4 @@
-# components/mapping_ui.py
+# data_unificator/components/mapping_ui.py

 import os
 import streamlit as st
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/normalization_ui.py
@@ -1,4 +1,4 @@
-# components/normalization_ui.py
+# data_unificator/components/normalization_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/components/validation_ui.py
@@ -1,4 +1,4 @@
-# components/validation_ui.py
+# data_unificator/components/validation_ui.py

 import streamlit as st
 from streamlit.logger import get_logger
2 changes: 1 addition & 1 deletion streamlit_app/app/data_unificator/config.py
@@ -1,4 +1,4 @@
-# config.py
+# data_unificator/config.py

 import os
 import yaml
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/data_normalizer.py
@@ -8,7 +8,7 @@

 import pandas as pd
 import streamlit as st
-from utils import (
+from entity_bridge.utils import (
     generate_unique_identifier,
     normalize_text,
     log_normalization_actions
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/duplicate_remover.py
@@ -8,7 +8,7 @@

 import pandas as pd
 import streamlit as st
-from utils import log_normalization_actions
+from entity_bridge.utils import log_normalization_actions

 def identify_duplicates(df, selected_fields):
     """
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/entity_matcher.py
@@ -9,7 +9,7 @@

 import pandas as pd
 import streamlit as st
-from utils import calculate_similarity, generate_unique_identifier
+from entity_bridge.utils import calculate_similarity, generate_unique_identifier
 from collections import defaultdict

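The three hunks above (data_normalizer.py, duplicate_remover.py, entity_matcher.py) switch from a bare "from utils import ..." to the package-qualified "from entity_bridge.utils import ...". A minimal sketch of the layout this assumes, with names taken from the paths and imports in this diff (any __init__.py files are omitted):

streamlit_app/app/
    pages/
        Entity_Bridge.py    (new in this commit; imports entity_bridge.*)
    entity_bridge/
        utils.py
        data_loader.py
        data_normalizer.py
        duplicate_remover.py
        entity_matcher.py
        ui_helper.py

With this layout, entity_bridge resolves as a package from the new page regardless of which script is the entrypoint, whereas the bare utils import only worked when entity_bridge/ itself was on the import path.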
2 changes: 2 additions & 0 deletions streamlit_app/app/entity_bridge/ui_helper.py
@@ -29,6 +29,8 @@ def display_file_upload():

     if not uploaded_files:
         st.warning("Please upload at least two files to proceed.")
+    else:
+        st.info(f"{len(uploaded_files)} files were uploaded.")
     return uploaded_files

 def display_missing_data_options(idx, file_name):
119 changes: 119 additions & 0 deletions streamlit_app/app/pages/Entity_Bridge.py
@@ -0,0 +1,119 @@
# Entity_Bridge.py

app_version = "0.1"
app_title = "OllaLab - Entity Bridge"
app_description = "Seamlessly merge multiple datasets based on entity names"
app_icon = ":link:"

import streamlit as st
from entity_bridge import data_loader
from entity_bridge import data_normalizer
from entity_bridge import duplicate_remover
from entity_bridge import entity_matcher
from entity_bridge import ui_helper

def process_file(file, idx):
    """
    Process a single uploaded file, including loading, handling missing data,
    field selection, and initial data preparation.
    Args:
        file (UploadedFile): The file uploaded by the user.
        idx (int): Index of the file in the list of uploaded files.
    Returns:
        tuple: A tuple (DataFrame, selected_fields), where DataFrame is the
        processed DataFrame, and selected_fields is a dictionary of selected fields.
    Side Effects:
        Displays messages and widgets in the Streamlit UI.
    """
    st.header(f"Processing file: {file.name}")

    try:
        # Load the data
        df = data_loader.load_data(file)
        st.success(f"Successfully loaded {file.name}")

        # Display the first few rows of the data
        st.subheader("Data Preview")
        st.dataframe(df.head())

        # Handle missing data
        strategy, default_value, missing_threshold = ui_helper.display_missing_data_options(idx, file.name)

        if strategy == 'remove':
            df = data_loader.handle_missing_data(df, 'remove')
        elif strategy == 'fill':
            df = data_loader.handle_missing_data(df, 'fill', default_value=default_value)
        elif strategy == 'skip':
            df = data_loader.handle_missing_data(df, 'skip', missing_threshold=missing_threshold)
        else:
            st.error("Invalid strategy selected for handling missing data.")
            return None, None

        # Field selection
        selected_fields = ui_helper.display_field_selection(df, file.name, idx)

        if not selected_fields.get('parent_name'):
            st.error("Parent Name Field is mandatory. Cannot proceed without it.")
            return None, None

        # Ensure required columns are in the DataFrame
        required_columns = [field for field in selected_fields.values() if field]
        df_selected = df[required_columns].copy()

        st.success(f"File {file.name} processed successfully.")
        return df_selected, selected_fields

    except Exception as e:
        st.error(f"An error occurred while processing {file.name}: {e}")
        return None, None

st.title("Entity Bridge")
st.write("Merge multiple datasets containing entity information with overlapping entities.")

# Step 1: File Upload
uploaded_files = ui_helper.display_file_upload()

data_frames = []

if uploaded_files and len(uploaded_files) >= 2:
    # Step 2: Load and preprocess the data files
    for idx, file in enumerate(uploaded_files):
        df_selected, selected_fields = process_file(file, idx)
        if df_selected is not None and selected_fields:
            data_frames.append((df_selected, selected_fields))
        else:
            st.error(f"Failed to process file {file.name}.")
else:
    st.warning("Please upload at least two files to proceed.")

if data_frames:
    st.header("Normalizing Data and Removing Duplicates")
    # Step 3: Normalize IDs and Names
    normalized_data_frames = data_normalizer.normalize_data_frames(data_frames)

    # Step 4: Remove Duplicates
    deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)

    st.header("Matching Entities and Assigning Unique Identifiers")
    # Step 5: Construct Unique Parent List
    unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)

    # Step 6: Construct Unique Child List
    unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)

    # Step 7: Enrich DataFrames with Unique IDs
    enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
        deduplicated_data_frames, unique_parents_df, unique_children_df
    )

    # Step 8: Display Enriched DataFrames
    ui_helper.display_enriched_data(enriched_data_frames)

    # Step 9: Download Enriched DataFrames
    ui_helper.download_enriched_data(enriched_data_frames)

else:
    st.warning("Please upload at least two files to proceed.")
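Because Entity_Bridge.py sits in the app's pages/ directory, it should appear as a page in the Streamlit sidebar once the app's main entrypoint is started. A minimal way to try it, assuming Streamlit is installed and that the main script lives directly under streamlit_app/app (the entrypoint's filename is not shown in this commit, so main.py below is a placeholder):

cd streamlit_app/app
streamlit run main.py

Launching from streamlit_app/app keeps the entity_bridge package importable, which the import changes in this commit depend on.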
