Skip to content

Commit

Permalink
#26 add basic runner
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivieFranklova committed Sep 30, 2024
1 parent 832080a commit 5ea367d
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 0 deletions.
Empty file added similarityRunner/UI/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions similarityRunner/UI/run_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import sys

from models.connector_models import ConnectorSettings
from models.user_models import SimilaritySettings, MetadataSettings
import runner as r

if __name__ == "__main__":
    # Command-line entry point.
    # Usage: run_similarity.py <directory> <run_type>
    #   directory - path to the directory with the input files
    #   run_type  - one of: all, metadata, similarity
    #
    # Only the argument parsing is guarded: an IndexError raised somewhere
    # inside the pipeline must not be misreported as a missing argument.
    try:
        directory = sys.argv[1]
        run_type = sys.argv[2]  # all, metadata, similarity
    except IndexError:
        print("Add path to directory and run type (all, metadata, similarity)")
        sys.exit(1)

    # Imported here because the comparator type has to be supplied when the
    # settings are built (it is a required field of SimilaritySettings).
    from models.user_models import ComparatorType

    # All fields of SimilaritySettings are required, so the model is built in
    # one validated constructor call instead of assigning attributes to an
    # empty instance (which fails validation before any assignment happens).
    settings = SimilaritySettings(
        # NOTE(review): directory is passed as a single string, as before;
        # confirm that ConnectorSettings.directory_paths does not expect a list.
        connector=ConnectorSettings(file_type=("csv", "parquet"), files_paths=[], directory_paths=directory),
        metadata=MetadataSettings(all=True, kinds=True, types=True, embeddings=True),
        run_type=run_type,
        # The runner falls back to the plain Comparator when no other
        # comparator is selected, so that one is used as the default here.
        comparator_type=ComparatorType.BY_TYPE,
    )
    r.run(settings)
14 changes: 14 additions & 0 deletions similarityRunner/interfaces/OutputFormaterInterface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
This module contains the abstract interface for output formaters.
"""
import abc

class OutputFormaterInterface(metaclass=abc.ABCMeta):
    """
    Abstract interface for output formaters.

    It cannot be instantiated directly; every concrete formater class
    has to subclass it and provide its own ``format`` method.
    """

    @abc.abstractmethod
    def format(self, data: dict):
        """
        Format the given data.

        The base implementation does nothing and returns None.
        """
31 changes: 31 additions & 0 deletions similarityRunner/models/user_models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""
This module contains the user models
"""
from enum import Enum, EnumType

from pydantic import BaseModel

from Comparator import Comparator
from ComparatorByColumn import ComparatorByColumn
from models.connector_models import ConnectorSettings


class SimilarityOutput(BaseModel):
"""
Expand All @@ -13,3 +18,29 @@ class SimilarityOutput(BaseModel):
# here will be common fields for all similarity models
table_names: list[str]
distances: dict[(str, str), float]

class MetadataSettings(BaseModel):
    """
    MetadataSettings class is a base class for metadata settings.

    It holds boolean flags describing which metadata should be created.
    All four flags are required when the model is constructed.
    """
    # When true, the runner creates the complete metadata for every table
    # (column embeddings, advanced structural types and column kinds).
    all: bool
    # NOTE(review): presumably enables column kind computation on its own;
    # not read by the runner yet - confirm.
    kinds: bool
    # NOTE(review): presumably enables structural type computation on its own;
    # not read by the runner yet - confirm.
    types: bool
    # NOTE(review): presumably enables column embedding creation on its own;
    # not read by the runner yet - confirm.
    embeddings: bool

class RunType(str, Enum):
    """
    RunType enumerates which part of the pipeline should be run.

    The class derives from Enum (EnumType is only the metaclass of enums and
    must not be used as a base class). It also mixes in str, so a member
    compares equal to its plain string value, e.g. RunType.ALL == "all".
    """
    ALL = "all"
    METADATA = "metadata"
    SIMILARITY = "similarity"

class ComparatorType(Enum):
    """
    ComparatorType enumerates the available comparators.

    The class derives from Enum (EnumType is only the metaclass of enums and
    must not be used as a base class). The value of each member is an
    instance of the corresponding comparator class.
    """
    BY_COLUMN = ComparatorByColumn()
    BY_TYPE = Comparator()
class SimilaritySettings(BaseModel):
    """
    SimilaritySettings class is a base class for similarity settings.

    It groups the settings for one run of the similarity pipeline.
    All four fields are required when the model is constructed.
    """
    # Settings for the connector that loads the input data.
    connector: ConnectorSettings
    # Flags describing which metadata should be created.
    metadata: MetadataSettings
    # Which part of the pipeline is run: all, metadata or similarity.
    run_type: RunType
    # Which comparator is used to compute the similarity.
    comparator_type: ComparatorType
70 changes: 70 additions & 0 deletions similarityRunner/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
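This module runs the similarity pipeline: it loads the data,
creates metadata for each table and computes similarity between tables.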
"""

from Comparator import Comparator
from ComparatorByColumn import ComparatorByColumn, ColumnKindComparator, ColumnExactNamesComparator
from DataFrameMetadata import DataFrameMetadata
from DataFrameMetadataCreator import DataFrameMetadataCreator
from connectors.filesystem_connector import FilesystemConnector
from interfaces.OutputFormaterInterface import OutputFormaterInterface
from models.connector_models import Output
from models.user_models import SimilaritySettings

def create_metadata(settings: SimilaritySettings, data: Output) -> dict[str, DataFrameMetadata]:
    """
    Build the metadata of every table in the data.

    The data is unpacked into dataframes and their names. When
    settings.metadata.all is set, each dataframe gets its column embeddings,
    advanced structural types and column kinds computed, and the result is
    stored under the name of the table. Otherwise an empty dictionary is
    returned.
    """
    dataframes, names = data
    if not settings.metadata.all:
        # todo after #35
        return {}

    # todo save metadata after #35
    return {
        table_name: (
            DataFrameMetadataCreator(dataframe)
            .create_column_embeddings()
            .compute_advanced_structural_types()
            .compute_column_kind()
            .get_metadata()
        )
        for dataframe, table_name in zip(dataframes, names)
    }


def __get_comparator(settings: SimilaritySettings):
    """
    Get comparator based on settings

    Returns a ComparatorByColumn with the column kind and exact names
    comparators added when settings.comparator_type selects BY_COLUMN,
    otherwise a plain Comparator.
    """
    comparator_type = settings.comparator_type
    # An enum member never compares equal to a plain string, so the member
    # name is used for the check; a plain "BY_COLUMN" string is accepted too.
    type_name = getattr(comparator_type, "name", comparator_type)
    if type_name == "BY_COLUMN":
        comp = ComparatorByColumn()
        return comp.add_comparator_type(ColumnKindComparator()).add_comparator_type(ColumnExactNamesComparator())
        # todo add by settings #35
    return Comparator()  # todo #35

def compute_similarity(settings: SimilaritySettings, data: dict[str, DataFrameMetadata]):
    """
    Compare the metadata of every table with the metadata of every table.

    The comparator is chosen by the settings. The result maps each ordered
    pair of table names, including a table paired with itself, to the value
    returned by the comparator for that pair.
    """
    comparator = __get_comparator(settings)
    return {
        (first_name, second_name): comparator.compare(first_metadata, second_metadata)
        for first_name, first_metadata in data.items()
        for second_name, second_metadata in data.items()
    }

def run(settings: SimilaritySettings, formater: OutputFormaterInterface | None = None):
    """
    Run the similarity pipeline

    The data is loaded by the filesystem connector and then processed
    according to settings.run_type:
    - "all": metadata is created and the similarity is computed,
    - "metadata": only metadata is created,
    - "similarity": not implemented yet, only a message is printed.

    :param settings: settings of the run
    :param formater: optional concrete formater used to format the computed
        similarity; OutputFormaterInterface itself is abstract and cannot be
        instantiated, so without a formater the raw similarity is returned
    :return: the (optionally formatted) similarity for the "all" run type,
        None otherwise
    """
    data = FilesystemConnector().get_data(settings.connector)
    # run_type may be an enum member or a plain string; compare by its value
    run_type = getattr(settings.run_type, "value", settings.run_type)
    if run_type == "all":
        print("Creating metadata ...")
        met = create_metadata(settings, data)
        print("Metadata created")
        print("Computing similarity ...")
        res = compute_similarity(settings, met)
        if formater is None:
            return res
        # the interface declares format(), there is no format_output()
        return formater.format(res)
    if run_type == "metadata":
        create_metadata(settings, data)
    elif run_type == "similarity":
        print("Similarity")  # todo after #35
    return None

0 comments on commit 5ea367d

Please sign in to comment.