From cebfba309c556092ff8074c7c14a565ea99e5927 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Sat, 7 Dec 2024 10:29:18 +0100 Subject: [PATCH 01/11] Refactor comparator_by_type --- .../impl/comparator/comparator_by_column.py | 25 +- .../src/impl/comparator/comparator_by_type.py | 368 ++++++++++-------- .../test_similarity_comparator.py | 24 +- 3 files changed, 224 insertions(+), 193 deletions(-) diff --git a/similarity_framework/src/impl/comparator/comparator_by_column.py b/similarity_framework/src/impl/comparator/comparator_by_column.py index 605c37c..5527f5a 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_column.py +++ b/similarity_framework/src/impl/comparator/comparator_by_column.py @@ -97,6 +97,8 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, :param metadata2: second dataframe metadata :return: float number 0 or 1 """ + if metadata1.column_names_clean == {} or metadata2.column_names_clean == {}: + logger.warning("Warning: column_names_clean is not computed") return 0 if metadata1.column_names_clean[index1] == metadata2.column_names_clean[index2] else 1 @@ -146,7 +148,7 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, logger.debug( f"column embedding is not computed - [{metadata1.column_embeddings == {}} - {metadata2.column_embeddings == {}}] {index1 if index1 not in metadata1.column_embeddings else index2}" ) - return np.nan + return np.nan #TODO funguje to pro by_column ?? 
puvodne np.nan return 1 - cosine_sim( metadata1.column_embeddings[index1], metadata2.column_embeddings[index2], @@ -342,20 +344,22 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, """ are_nulls = (False, 0.0) - if DataKind.BOOL in self.compare_kind: + if DataKind.BOOL in self.compare_kind and DataKind.BOOL in metadata1.column_kind and DataKind.BOOL in metadata2.column_kind: if index1 in metadata1.column_kind[DataKind.BOOL] and index2 in metadata2.column_kind[DataKind.BOOL]: return self.compare_bools(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) are_nulls = are_columns_null(metadata1.column_kind[DataKind.BOOL], metadata2.column_kind[DataKind.BOOL], "Boolean column") - if DataKind.ID in self.compare_kind: + + if DataKind.ID in self.compare_kind and DataKind.ID in metadata1.column_kind and DataKind.ID in metadata2.column_kind: if index1 in metadata1.column_kind[DataKind.ID] and index2 in metadata2.column_kind[DataKind.ID]: return self.compare_ids(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) are_nulls = are_columns_null(metadata1.column_kind[DataKind.ID], metadata2.column_kind[DataKind.ID], "ID column") - if DataKind.CATEGORICAL in self.compare_kind: + + if DataKind.CATEGORICAL in self.compare_kind and DataKind.CATEGORICAL in metadata1.column_kind and DataKind.CATEGORICAL in metadata2.column_kind: if index1 in metadata1.column_kind[DataKind.CATEGORICAL] and index2 in metadata2.column_kind[DataKind.CATEGORICAL]: return self.compare_categoricals(metadata1.categorical_metadata[index1], metadata2.categorical_metadata[index2]) are_nulls = are_columns_null(metadata1.column_kind[DataKind.CATEGORICAL], metadata2.column_kind[DataKind.CATEGORICAL], "Categorical column") - if DataKind.CONSTANT in self.compare_kind: + if DataKind.CONSTANT in self.compare_kind and DataKind.CONSTANT in metadata1.column_kind and DataKind.CONSTANT in metadata2.column_kind: if index1 in metadata1.column_kind[DataKind.CONSTANT] 
and index2 in metadata2.column_kind[DataKind.CONSTANT]: return self.compare_constants(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) @@ -369,11 +373,10 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, class ColumnTypeHandler(SpecificColumnHandler): def __numerical_compare1( - self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, column1_type: type[Type], column2_type: type[Type] + self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int ) -> float: num_met1 = metadata1.numerical_metadata[index1] num_met2 = metadata2.numerical_metadata[index2] - score = 3 if column1_type == column2_type else 0 if num_met1.same_value_length == num_met2.same_value_length: score += 2 if num_met1.min_value == num_met2.min_value: @@ -389,11 +392,10 @@ def __numerical_compare1( return 1 - score / 9 def __nonnumerical_compare1( - self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, column1_type: type[Type], column2_type: type[Type] + self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int ) -> float: num_met1 = metadata1.nonnumerical_metadata[index1] num_met2 = metadata2.nonnumerical_metadata[index2] - score = 3 if column1_type == column2_type else 0 if num_met1.longest == num_met2.longest or num_met1.longest is num_met2.longest: score += 2 if num_met1.shortest == num_met2.shortest or num_met1.shortest is num_met2.shortest: @@ -415,11 +417,12 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, """ column1_type = metadata1.get_column_type(index1) column2_type = metadata2.get_column_type(index2) + score = 3 if column1_type == column2_type else 0 if index1 in metadata1.numerical_metadata and index2 in metadata2.numerical_metadata: - return self.__numerical_compare1(metadata1, metadata2, index1, index2, column1_type, column2_type) + return self.__numerical_compare1(metadata1, metadata2, index1, index2, score) if 
index1 in metadata1.nonnumerical_metadata and index2 in metadata2.nonnumerical_metadata: - return self.__nonnumerical_compare1(metadata1, metadata2, index1, index2, column1_type, column2_type) + return self.__nonnumerical_compare1(metadata1, metadata2, index1, index2, score) if column1_type == column2_type: return 0 diff --git a/similarity_framework/src/impl/comparator/comparator_by_type.py b/similarity_framework/src/impl/comparator/comparator_by_type.py index 664fbca..c37e0a4 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_type.py +++ b/similarity_framework/src/impl/comparator/comparator_by_type.py @@ -8,13 +8,15 @@ from torch import Tensor from logging_ import logger +from similarity_framework.src.impl.comparator.comparator_by_column import ColumnTypeHandler, IncompleteColumnsHandler, ColumnExactNamesHandler, \ + ColumnNamesEmbeddingsHandler, ColumnEmbeddingsHandler, SizeHandler, ColumnKindHandler from similarity_framework.src.impl.comparator.utils import cosine_sim, get_ratio, concat, fill_result from similarity_framework.src.interfaces.common import DistanceFunction -from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin +from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin, AverageDist from similarity_framework.src.interfaces.comparator.comparator import HandlerType, Comparator from similarity_framework.src.models.metadata import Metadata from similarity_framework.src.models.similarity import SimilarityOutput, Settings -from similarity_framework.src.models.types_ import DataKind +from similarity_framework.src.models.types_ import DataKind, Type from similarity_framework.src.models.settings import AnalysisSettings @@ -145,110 +147,110 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data return concat(result, name_distance) -class ColumnEmbeddingHandler(HandlerType): - """ - Handler for column values embeddings - """ - - def 
compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - """ - Compare embeddings of columns - :param metadata1: first table - :param metadata2: second table - :return: dataframe full of numbers between 0 and 1 - """ - result = pd.DataFrame() - name_distance = pd.DataFrame() - for id1, ( - column1, - embedding1, - ) in enumerate(metadata1.column_embeddings.items()): - for id2, ( - column2, - embedding2, - ) in enumerate(metadata2.column_embeddings.items()): - result.loc[id1, id2] = 1 - cosine_sim(embedding1, embedding2) - name_distance.loc[id1, id2] = 1 - cosine_sim( - metadata1.column_embeddings[column1], - metadata2.column_embeddings[column2], - ) - return concat(result, name_distance) - - -class SizeHandler(HandlerType): - """ - Size of table Handler class - """ - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - """ - If sizes are the same distance is 0, else distance is 1 - % of max - :param metadata1: first table - :param metadata2: second table - :return: dataframe of size 1x1 fill with distance number (0-1) # todo test - """ - max_size = int(max(metadata1.size, metadata2.size)) - min_size = int(min(metadata1.size, metadata2.size)) - distance = 1 - (min_size / max_size) - return pd.DataFrame(index=range(1), columns=range(1)).fillna(distance) - # todo if this is not working try this We will fill the whole table with this numer, distance function should compute the same number (todo test) - - -class ColumnExactNamesHandler(HandlerType): - """ - Handler for exact column names - """ - - def compare(self, metadata1: Metadata, metadata2: Metadata, *kwargs) -> pd.DataFrame: - """ - This is dummy Handler if the names are exactly the same distance is 0 if not distance is 1 - :param metadata1: first table - :param metadata2: second table - :return: dataframe fill by 0 and 1 - """ - if metadata1.column_names_clean == {} or metadata2.column_names_clean == {}: - logger.warning("Warning: 
column_names_clean is not computed") - return fill_result(metadata1.column_names_clean, metadata2.column_names_clean) - - -class ColumnNamesEmbeddingsHandler(HandlerType): - """ - Handler for column names embeddings - """ - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - """ - Computes cosine distance for each column name embedding - :param distance_function: - not used - :param metadata1: first table - :param metadata2: second table - :param settings: - not used - :return: dataframe fill by distances between 0 and 1 - """ - if metadata1.column_name_embeddings == {} or metadata2.column_name_embeddings == {}: - logger.warning("Warning: column name embedding is not computed") - - result = pd.DataFrame() - for idx1, name1 in enumerate(metadata1.column_name_embeddings.values()): - for idx2, name2 in enumerate(metadata2.column_name_embeddings.values()): - result.loc[idx1, idx2] = 1 - cosine_sim(name1, name2) - return result - - -class IncompleteColumnsHandler(HandlerType): - """ - Handler for incomplete columns - """ - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - """ - Compare if two columns are complete or incomplete, if both have same outcome (True False) - the distance is 0 otherwise is 1 - :param metadata1: first table - :param metadata2: second table - :return: dataframe full of 1 and 0 - """ - return fill_result(metadata1.column_incomplete, metadata2.column_incomplete) +# class ColumnEmbeddingHandler(HandlerType): +# """ +# Handler for column values embeddings +# """ +# +# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: +# """ +# Compare embeddings of columns +# :param metadata1: first table +# :param metadata2: second table +# :return: dataframe full of numbers between 0 and 1 +# """ +# result = pd.DataFrame() +# name_distance = pd.DataFrame() +# for id1, ( +# column1, +# embedding1, +# ) in 
enumerate(metadata1.column_embeddings.items()): +# for id2, ( +# column2, +# embedding2, +# ) in enumerate(metadata2.column_embeddings.items()): +# result.loc[id1, id2] = 1 - cosine_sim(embedding1, embedding2) +# name_distance.loc[id1, id2] = 1 - cosine_sim( +# metadata1.column_embeddings[column1], +# metadata2.column_embeddings[column2], +# ) +# return concat(result, name_distance) + + +# class SizeHandler(HandlerType): +# """ +# Size of table Handler class +# """ +# +# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: +# """ +# If sizes are the same distance is 0, else distance is 1 - % of max +# :param metadata1: first table +# :param metadata2: second table +# :return: dataframe of size 1x1 fill with distance number (0-1) # todo test +# """ +# max_size = int(max(metadata1.size, metadata2.size)) +# min_size = int(min(metadata1.size, metadata2.size)) +# distance = 1 - (min_size / max_size) +# return pd.DataFrame(index=range(1), columns=range(1)).fillna(distance) +# # todo if this is not working try this We will fill the whole table with this numer, distance function should compute the same number (todo test) + + +# class ColumnExactNamesHandler(HandlerType): +# """ +# Handler for exact column names +# """ +# +# def compare(self, metadata1: Metadata, metadata2: Metadata, *kwargs) -> pd.DataFrame: +# """ +# This is dummy Handler if the names are exactly the same distance is 0 if not distance is 1 +# :param metadata1: first table +# :param metadata2: second table +# :return: dataframe fill by 0 and 1 +# """ +# if metadata1.column_names_clean == {} or metadata2.column_names_clean == {}: +# logger.warning("Warning: column_names_clean is not computed") +# return fill_result(metadata1.column_names_clean, metadata2.column_names_clean) + + +# class ColumnNamesEmbeddingsHandler(HandlerType): +# """ +# Handler for column names embeddings +# """ +# +# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: 
+# """ +# Computes cosine distance for each column name embedding +# :param distance_function: - not used +# :param metadata1: first table +# :param metadata2: second table +# :param settings: - not used +# :return: dataframe fill by distances between 0 and 1 +# """ +# if metadata1.column_name_embeddings == {} or metadata2.column_name_embeddings == {}: +# logger.warning("Warning: column name embedding is not computed") +# +# result = pd.DataFrame() +# for idx1, name1 in enumerate(metadata1.column_name_embeddings.values()): +# for idx2, name2 in enumerate(metadata2.column_name_embeddings.values()): +# result.loc[idx1, idx2] = 1 - cosine_sim(name1, name2) +# return result + + +# class IncompleteColumnsHandler(HandlerType):# todo thsi ok +# """ +# Handler for incomplete columns +# """ +# +# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: +# """ +# Compare if two columns are complete or incomplete, if both have same outcome (True False) +# the distance is 0 otherwise is 1 +# :param metadata1: first table +# :param metadata2: second table +# :return: dataframe full of 1 and 0 +# """ +# return fill_result(metadata1.column_incomplete, metadata2.column_incomplete) ## class KindHandler(HandlerType): @@ -509,51 +511,6 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data return pd.DataFrame([result]) -class TypeHandler(HandlerType): - - def __numerical_compare1(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str): - num_met1 = metadata1.numerical_metadata[index1] - num_met2 = metadata2.numerical_metadata[index2] - score = 3 - if num_met1.same_value_length == num_met2.same_value_length: - score += 2 - if num_met1.min_value == num_met2.min_value: - score += 1 - elif num_met1.min_value == num_met2.min_value + num_met1.range_size / 100 or num_met1.max_value == num_met2.max_value - num_met1.range_size / 100: - score += 0.5 - if num_met1.max_value == num_met2.max_value: - score += 1 - elif 
num_met1.max_value == num_met2.max_value - num_met1.range_size / 100 or num_met1.max_value == num_met2.max_value + num_met1.range_size / 100: - score += 0.5 - if num_met1.range_size == num_met2.range_size: - score += 2 - return 1 - score / 9 - - def __nonnumerical_compare1(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: - num_met1 = metadata1.nonnumerical_metadata[index1] - num_met2 = metadata2.nonnumerical_metadata[index2] - score = 3 - if num_met1.longest == num_met2.longest: - score += 2 - if num_met1.shortest == num_met2.shortest: - score += 2 - if num_met1.avg_length == num_met2.avg_length: - score += 2 - elif num_met1.avg_length == num_met2.avg_length + num_met1.avg_length / 100 or num_met1.avg_length == num_met2.avg_length - num_met1.avg_length / 100: - score += 1 - return 1 - score / 9 - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame | float: - result = pd.DataFrame() - for type_ in metadata1.column_type: - for idx1, name1 in enumerate(metadata1.column_type[type_]): - for idx2, name2 in enumerate(metadata2.column_type[type_]): - if name1 in metadata1.numerical_metadata and name2 in metadata2.numerical_metadata: - result.loc[idx1, idx2] = self.__numerical_compare1(metadata1, metadata2, name1, name2) - - if name1 in metadata1.nonnumerical_metadata and name2 in metadata2.nonnumerical_metadata: - result.loc[idx1, idx2] = self.__nonnumerical_compare1(metadata1, metadata2, name1, name2) - return result class ComparatorByType(Comparator): @@ -573,15 +530,42 @@ def from_settings(settings: AnalysisSettings) -> "ComparatorByType": if settings.column_name_embeddings: comparator.add_comparator_type(ColumnNamesEmbeddingsHandler(settings.weights.column_name_embeddings)) if settings.column_embeddings: - comparator.add_comparator_type(ColumnEmbeddingHandler(settings.weights.column_embeddings)) + comparator.add_comparator_type(ColumnEmbeddingsHandler(settings.weights.column_embeddings)) if 
settings.kinds: - comparator.add_comparator_type(CategoricalHandler(settings.weights.kinds)) + comparator.set_kinds(True) + comparator.kind_weight = settings.weights.kinds if settings.type_basic or settings.type_structural or settings.type_advanced: - comparator.add_comparator_type(TypeHandler(settings.weights.type)) + comparator.set_types(True) + comparator.type_weight = settings.weights.type + if settings.distance_function: + func = HausdorffDistanceMin() if settings.distance_function == "HausdorffDistanceMin" else AverageDist() + comparator.set_distance_function(func) logger.info("Comparator by type created") logger.info(f"Handlers used: {','.join([item.__class__.__name__ for item in comparator.comparator_type])}") return comparator + def __init__(self): + super().__init__() + self.kinds = False + self.types = False + self.kinds_compare = True + self.types_compare = True + self.kind_weight = 1 + self.type_weight = 1 + def set_kinds(self, value: bool) -> "ComparatorByType": + """ + Set if kinds should be compared + """ + self.kinds = value + return self + + def set_types(self, value: bool) -> "ComparatorByType": + """ + Set if types should be compared + """ + self.types = value + return self + def add_comparator_type(self, comparator: HandlerType) -> "ComparatorByType": """ Add comparator @@ -589,36 +573,78 @@ def add_comparator_type(self, comparator: HandlerType) -> "ComparatorByType": self.comparator_type.append(comparator) return self + def __compare_all_columns(self, metadata1: Metadata, metadata2: Metadata, + column_names1: set[str], column_names2: set[str], + comparators: list[HandlerType]) -> pd.DataFrame: + all_compares = [] + for comparator in comparators: + col_to_col = pd.DataFrame() + for idx1, name1 in enumerate(column_names1): + for idx2, name2 in enumerate(column_names2): + result = comparator.compare(metadata1, metadata2, index1=name1, index2=name2) + if result is not np.nan: + col_to_col.loc[idx1, idx2] = result + if not col_to_col.empty: 
all_compares.append(col_to_col) # todo add , comparator.weight + return pd.DataFrame if all_compares == [] else concat(*all_compares) + + def __compare_types(self, type_, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: + comparators = self.comparator_type.copy() + if self.types_compare: comparators.append(ColumnTypeHandler()) + all_compares = self.__compare_all_columns(metadata1, metadata2, + metadata1.column_type[type_], + metadata2.column_type[type_], + comparators) + return all_compares + + def __compare_kinds(self, kind, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: + comparators = self.comparator_type.copy() + if self.kinds_compare: comparators.append(ColumnKindHandler()) + all_compares = self.__compare_all_columns(metadata1, metadata2, + metadata1.column_kind[kind], + metadata2.column_kind[kind], + comparators) + return all_compares + def _compare(self, metadata1: Metadata, metadata2: Metadata) -> SimilarityOutput: """ Compare two tables according to previously set properties. 
""" distances = [] - for comp in self.comparator_type: - distance_table = comp.compare( - metadata1, - metadata2, - ) - distances.append( - ( - self.distance_function.compute(distance_table), - get_ratio( - distance_table.shape[0], - distance_table.shape[1], - ), - comp.weight, - ) - ) + if self.types: + for type_ in metadata1.column_type.keys(): + if metadata1.column_type[type_] == set() or metadata2.column_type[type_] == set(): + continue + dist_table = self.__compare_types(type_, metadata1, metadata2) + if not dist_table.empty: + distances.append((self.distance_function.compute(dist_table), + get_ratio( + dist_table.shape[0], + dist_table.shape[1], + ), + self.type_weight)) + if self.kinds: + for kind in metadata1.column_kind.keys(): + if metadata1.column_kind[kind] != () and metadata2.column_kind[kind] != (): + dist_table = self.__compare_kinds(kind, metadata1, metadata2) + if not dist_table.empty: + distances.append((self.distance_function.compute(dist_table), + get_ratio( + dist_table.shape[0], + dist_table.shape[1], + ), + self.kind_weight)) + result = 0 nan = 0 + sum_weight = sum([weight for _,_, weight in distances if not np.isnan(weight)]) for dist, ratio, weight in distances: if math.isnan(dist): nan += 1 continue if Settings.NO_RATIO in self.settings: - result += dist * dist * weight + result += dist * dist * weight/sum_weight else: - result += dist * dist * ratio * weight + result += dist * dist * ratio * weight/sum_weight if nan == len(distances): return SimilarityOutput(distance=1) return SimilarityOutput(distance=np.sqrt(result)) diff --git a/tests/similarity_framework/test_similarity_comparator.py b/tests/similarity_framework/test_similarity_comparator.py index 1a3e130..c56c296 100644 --- a/tests/similarity_framework/test_similarity_comparator.py +++ b/tests/similarity_framework/test_similarity_comparator.py @@ -123,7 +123,7 @@ def test_both_columns_non_empty(self): class TestSingleSpecificComparator(unittest.TestCase): def setUp(self): - 
self.compartor = ComparatorByType() + self.compartor = ComparatorByType().set_types(True).set_kinds(True) self.file = os.path.join(THIS_DIR, '../data_validation/edge_cases.csv') self.data = pd.read_csv(self.file) @@ -145,6 +145,9 @@ def setUp(self): self.metadata_first_half = self.metadata_creator.get_metadata(MetadataCreatorInput(dataframe=self.data_first_half)) self.metadata_second_half = self.metadata_creator.get_metadata(MetadataCreatorInput(dataframe=self.data_second_half)) + self.compartor.types_compare = False + self.compartor.kinds_compare = False + def test_size_compare(self): self.compartor.add_comparator_type(SizeHandler()) @@ -184,7 +187,9 @@ def test_embeddings_names_compare(self): self.assertEqual(self.compartor.compare(self.metadata1, self.metadata1).distance, 0) def test_kind_compare(self): - self.compartor.add_comparator_type(KindHandler()) + self.compartor.types_compare = True + self.compartor.kinds_compare = True + self.compartor.add_comparator_type(ColumnKindHandler()) self.assertEqual(self.compartor.compare(self.metadata1, self.metadata1).distance, 0) self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) @@ -195,27 +200,24 @@ def test_kind_compare(self): # self.assertEqual(self.compartor.compare(self.metadata_first_half, self.metadata_second_half), 0) def test_kind_BOOL_compare(self): - self.compartor.add_comparator_type(KindHandler(compare_kind=[DataKind.BOOL])) + self.compartor.set_types(False) + self.compartor.add_comparator_type(ColumnKindHandler(compare_kind=[DataKind.BOOL])) self.assertEqual( self.compartor.compare(self.metadata1, self.metadata1).distance, 0) self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) self.assertEqual(self.compartor.compare(self.metadata_first_half, self.metadata_second_half).distance, 0) def test_kind_ID_compare(self): - self.compartor.add_comparator_type(KindHandler(compare_kind=[DataKind.ID])) + 
self.compartor.set_types(False) + self.compartor.add_comparator_type(ColumnKindHandler(compare_kind=[DataKind.ID])) self.assertEqual( self.compartor.compare(self.metadata1, self.metadata1).distance, 0) self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) - def test_kind_CATEGORICAL_compare(self): - self.compartor.add_comparator_type(KindHandler(compare_kind=[DataKind.CATEGORICAL])) - self.assertEqual( - self.compartor.compare(self.metadata1, self.metadata1).distance, 0) - self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) - self.assertEqual(self.compartor.compare(self.metadata_first_half, self.metadata_second_half).distance, 0) def test_kind_CONSTANT_compare(self): - self.compartor.add_comparator_type(KindHandler(compare_kind=[DataKind.CONSTANT])) + self.compartor.set_types(False) + self.compartor.add_comparator_type(ColumnKindHandler(compare_kind=[DataKind.CONSTANT])) self.assertEqual( self.compartor.compare(self.metadata1, self.metadata1).distance, 0) self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) From 3e2d4a142ae02d2f4a0bb8c33ced226d849373bf Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Sat, 7 Dec 2024 10:29:31 +0100 Subject: [PATCH 02/11] Update UI --- similarity_runner/src/impl/cli.py | 2 +- similarity_runner/src/interfaces/ui.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/similarity_runner/src/impl/cli.py b/similarity_runner/src/impl/cli.py index a14c01d..45f4da2 100644 --- a/similarity_runner/src/impl/cli.py +++ b/similarity_runner/src/impl/cli.py @@ -34,7 +34,7 @@ def show(self, result: dict[tuple[str, str], SimilarityOutput], settings: Analys def _load_user_input(self) -> Any: parser = argparse.ArgumentParser( - prog="SimilarityRunner CLI", + prog="SimilarityRunner", description="This is a CLI for interaction with similarity-framework, which is a framework for 
comparing data", ) parser.add_argument( diff --git a/similarity_runner/src/interfaces/ui.py b/similarity_runner/src/interfaces/ui.py index 2fe309e..5184581 100644 --- a/similarity_runner/src/interfaces/ui.py +++ b/similarity_runner/src/interfaces/ui.py @@ -24,8 +24,8 @@ def show(self, result: dict[tuple[str, str], SimilarityOutput], settings: Analys pass def run(self): - something = self._load_user_input() - metadata_input, comparator, metadata_creator, analysis_settings = self._parse_input(something) + input_ = self._load_user_input() + metadata_input, comparator, metadata_creator, analysis_settings = self._parse_input(input_) logger.debug("Analysis settings: ") logger.debug(json.dumps(analysis_settings.model_dump(), indent=4)) logger.info(f"Metadata input has {len(metadata_input)} elements") @@ -43,3 +43,8 @@ def run(self): result[(first.name, second.name)] = comparator.compare(first, second, analysis_settings) # TODO: based on analysis settings get specified metadata objects self.show(result, analysis_settings) + + + + + From 3cb4aa55d13bf562e5b3f5c9267dedac6017b4f6 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Mon, 9 Dec 2024 12:42:36 +0100 Subject: [PATCH 03/11] Move handlers to their own class --- .../impl/comparator/comparator_by_column.py | 425 +-------- .../src/impl/comparator/comparator_by_type.py | 493 ----------- .../src/impl/comparator/handlers.py | 815 ++++++++++++++++++ .../src/interfaces/comparator/comparator.py | 20 +- .../src/interfaces/comparator/handler.py | 23 + 5 files changed, 843 insertions(+), 933 deletions(-) create mode 100644 similarity_framework/src/impl/comparator/handlers.py create mode 100644 similarity_framework/src/interfaces/comparator/handler.py diff --git a/similarity_framework/src/impl/comparator/comparator_by_column.py b/similarity_framework/src/impl/comparator/comparator_by_column.py index 5527f5a..36bf2e8 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_column.py +++ 
b/similarity_framework/src/impl/comparator/comparator_by_column.py @@ -1,433 +1,16 @@ -import logging -from abc import abstractmethod, ABC -from importlib.metadata import metadata - import numpy as np import pandas as pd -from statistics import mean -from logging_ import logger + from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin, AverageDist -from similarity_framework.src.impl.comparator.utils import cosine_sim, are_columns_null +from similarity_framework.src.impl.comparator.handlers import SizeHandler, IncompleteColumnsHandler, ColumnExactNamesHandler, ColumnNamesEmbeddingsHandler, \ + ColumnEmbeddingsHandler, ColumnKindHandler, ColumnTypeHandler, TableHandler from similarity_framework.src.interfaces.comparator.comparator import HandlerType, Comparator -from similarity_framework.src.models.metadata import Metadata, KindMetadata, CategoricalMetadata +from similarity_framework.src.models.metadata import Metadata from similarity_framework.src.models.similarity import SimilarityOutput -from similarity_framework.src.models.types_ import DataKind, Type from similarity_framework.src.models.settings import AnalysisSettings -class BasicHandler(HandlerType): - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - if "index1" not in kwargs or "index2" not in kwargs: - raise RuntimeError(f"Handler didnt have sufficient arguments - index1 and index2 - {kwargs}") - return self._inner_compare(metadata1, metadata2, kwargs["index1"], kwargs["index2"]) - - @abstractmethod - def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> pd.DataFrame: - pass - - -class TableHandler(HandlerType, ABC): - """ - Abstract class for table handlers it should compare features of whole table - """ - - -class GeneralColumnHandler(BasicHandler, ABC): - """ - Handler for simple comparison - """ - - -class SpecificColumnHandler(BasicHandler, ABC): - """ - Handler for advanced 
comparison - """ - - -class SizeHandler(TableHandler): - """ - Handler of size of two tables - """ - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> float: - """ - Compare the size of the two dataframes. If sizes are the same distance is 0, else distance is 1 - % of max size. - :param index1: in this case is not used - :param index2: in this case it not used - :param metadata1: first dataframe metadata - :param metadata2: second dataframe metadata - :return: float number in range <0, 1> - """ - max_size = int(max(metadata1.size, metadata2.size)) - min_size = int(min(metadata1.size, metadata2.size)) - return 1 - (min_size / max_size) - - -class IncompleteColumnsHandler(GeneralColumnHandler): - """ - Handler for incomplete columns - """ - - def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: - """ - Compare if two columns are complete or incomplete. If both are complete, - or both are incomplete distance is 0, else distance is 1 - :param index2: name or id of column in metadata2 - :param index1: name or id of column in metadata1 - :param metadata1: first dataframe metadata - :param metadata2: second dataframe metadata - :return: float number 0 or 1 - """ - return 0 if metadata1.column_incomplete[index1] == metadata2.column_incomplete[index2] else 1 - - -class ColumnExactNamesHandler(GeneralColumnHandler): - """ - Handler for exact column names - """ - - def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: - """ - Compare if two columns have the same name. If both have the same name distance is 0, else distance is 1. 
- :param index2: name or id of column in metadata2 - :param index1: name or id of column in metadata1 - :param metadata1: first dataframe metadata - :param metadata2: second dataframe metadata - :return: float number 0 or 1 - """ - if metadata1.column_names_clean == {} or metadata2.column_names_clean == {}: - logger.warning("Warning: column_names_clean is not computed") - return 0 if metadata1.column_names_clean[index1] == metadata2.column_names_clean[index2] else 1 - - -class ColumnNamesEmbeddingsHandler(GeneralColumnHandler): - """ - Handler for column names embeddings - """ - - def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: - """ - Compare if two columns have similar name. Computes cosine distance for embeddings - :param index2: name or id of column in metadata2 - :param index1: name or id of column in metadata1 - :param metadata1: first dataframe metadata - :param metadata2: second dataframe metadata - :return: float number in range <0, 1> 0 exactly the same 1 completely different - """ - if metadata1.column_name_embeddings == {} or metadata2.column_name_embeddings == {}: - logging.warning("Warning: column name embedding is not computed") - return 1 - return 1 - cosine_sim( - metadata1.column_name_embeddings[index1], - metadata2.column_name_embeddings[index2], - ) - - -class ColumnEmbeddingsHandler(GeneralColumnHandler): - """ - Handler for column values embeddings - """ - - def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: - """ - Compare embeddings for two columns. Computes cosine distance for embeddings. 
- :param index2: name or id of column in metadata2 - :param index1: name or id of column in metadata1 - :param metadata1: first dataframe metadata - :param metadata2: second dataframe metadata - :return: float number in range <0, 1> 0 exactly the same 1 completely different - """ - if ( - metadata1.column_embeddings == {} - or metadata2.column_embeddings == {} - or index1 not in metadata1.column_embeddings - or index2 not in metadata2.column_embeddings - ): - logger.debug( - f"column embedding is not computed - [{metadata1.column_embeddings == {}} - {metadata2.column_embeddings == {}}] {index1 if index1 not in metadata1.column_embeddings else index2}" - ) - return np.nan #TODO funguje to pro by_column ?? puvodne np.nan - return 1 - cosine_sim( - metadata1.column_embeddings[index1], - metadata2.column_embeddings[index2], - ) - - -class ColumnKindHandler(SpecificColumnHandler): - """ - Handler for column kind - """ - - def __init__(self, compare_kind=None, weight=1): - """ - Constructor for ColumnKindHandler, sets which kinds should be compared and weight for each kind - """ - super().__init__(weight=weight) - if compare_kind is None: - self.compare_kind = [ - DataKind.BOOL, - DataKind.ID, - DataKind.CATEGORICAL, - DataKind.CONSTANT, - ] - else: - self.compare_kind = compare_kind - if weight is None: - self.kind_weight: dict = {DataKind.BOOL: 1, DataKind.ID: 1, DataKind.CATEGORICAL: 1, DataKind.CONSTANT: 1} - else: - self.kind_weight = weight - - def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # todo add type - """ - Creates table of distances between embeddings for each row and computes mean - of row and column minimums then pick max. 
- :param embeddings1: values for column1 - :param embeddings2: values for column2 - :return: float from 0 to 1 - """ - # alternative version - # res = pd.DataFrame() - # row_mins = [] - # for id1, embed1 in enumerate(embeddings1): - # for id2, embed2 in enumerate(embeddings2): - # res.loc[id1, id2] = 1 - cosine_sim(embed1, embed2) - # row_mins.append(res.loc[id1].min()) - # column_mins = [] - # for _, column in res.items(): - # column_mins.append(min(column)) - # return max([mean(row_mins), mean(column_mins)]) - - similarity_matrix = [[1 - cosine_sim(embed1, embed2) for embed2 in embeddings2] for embed1 in embeddings1] - res = pd.DataFrame(similarity_matrix) - row_mins = res.min(axis=1).tolist() - column_mins = res.min(axis=0).tolist() - return max(mean(row_mins), mean(column_mins)) - # todo vysvetlit v textu - - def compare_bools( - self, - metadata1: KindMetadata, - metadata2: KindMetadata, - ) -> float: - """ - Compare two boolean columns. Compare if they have the same distribution of True and False values. - Compare if they contain nulls. - Compare embeddings of values. - Make an average of these values. - :param metadata1: for column1 - :param metadata2: for column2 - :return: float number in range <0, 1> - """ - nulls = 0 if metadata1.nulls == metadata2.nulls else 1 - distr = abs(metadata1.distribution[0] / metadata1.distribution[1] - metadata2.distribution[0] / metadata2.distribution[1]) - if metadata1.value_embeddings is None or metadata2.value_embeddings is None: - return (nulls + distr) / 2 - return ( - nulls - + distr - + self.compute_embeddings_distance( - metadata1.value_embeddings, - metadata2.value_embeddings, - ) - ) / 3 - - def compare_categoricals( - self, - metadata1: CategoricalMetadata, - metadata2: CategoricalMetadata, - ) -> float: - """ - Compare two categorical columns. Compare if they contain nulls. - Compare embeddings of values. - Make an average of these values. 
- :param metadata1: for column1 - :param metadata2: for column2 - :return: float number in range <0, 1> - """ - value_re = self.compute_embeddings_distance( - metadata1.category_embedding, - metadata2.category_embedding, - ) - count1 = metadata1.count_categories - count2 = metadata2.count_categories - count_re = 1 - count1 / count2 if count1 < count2 else 1 - count2 / count1 - # todo compare categories_with_count for metadata1 and metadata2 - # firstly normalize dictionary categories_with_count then - # compare the difference between the two dictionaries - return (value_re + count_re) / 2 - - def compare_constants( - self, - metadata1: KindMetadata, - metadata2: KindMetadata, - ) -> float: - """ - Compare two constant columns. Compare if they contain nulls. - Compare embeddings of values. - Make an average of these values. - :param metadata1: for column1 - :param metadata2: for column2 - :return: float number in range <0, 1> - """ - nulls = 0 if metadata1.nulls == metadata2.nulls else 1 - if metadata1.value_embeddings is None or metadata2.value_embeddings is None: - value: float = 0 if metadata1.value == metadata2.value else 1 - else: - value = 1 - cosine_sim( - metadata1.value_embeddings[0], - metadata2.value_embeddings[0], - ) - # if nulls are equal and exist - if nulls == 0 and metadata1.nulls: - ratio1 = metadata1.distribution[0] / metadata1.distribution[1] - ratio2 = metadata2.distribution[0] / metadata2.distribution[1] - nulls = abs(ratio1 - ratio2) # compute difference between distribution - return (nulls + value) / 2 - - def compare_ids( - self, - metadata1: KindMetadata, - metadata2: KindMetadata, - ) -> float: - """ - Compare two id columns. Compare if they contain nulls. - Compare embeddings of values. - Compare ratio of max length. - Make an average of these values. 
- :return: float number in range <0, 1> - """ - embeddings1_longest = metadata1.longest_embeddings - embeddings2_longest = metadata2.longest_embeddings - embeddings1_shortest = metadata1.shortest_embeddings - embeddings2_shortest = metadata2.shortest_embeddings - - if embeddings1_longest is not None and embeddings2_longest is not None: - value_long_re = 1 - cosine_sim( - embeddings1_longest, - embeddings2_longest, - ) - else: - value_long_re = 0 if metadata1.longest == metadata2.longest else 1 - if embeddings1_shortest is not None and embeddings2_shortest is not None: - value_short_re = 1 - cosine_sim( - embeddings1_shortest, - embeddings2_shortest, - ) - else: - value_short_re = 0 if metadata1.shortest == metadata2.shortest else 1 - - nulls_re = 0 if metadata1.nulls == metadata2.nulls else 1 - ratio_max_re = abs(metadata1.ratio_max_length - metadata2.ratio_max_length) - return (value_short_re + value_long_re + nulls_re + ratio_max_re) / 4 - - def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: - """ - Compare if two columns have the same kind. If both have the same kind distance is 0, else distance is 1. 
- :param index2: name or id of column in metadata2 - :param index1: name or id of column in metadata1 - :param metadata1: first dataframe metadata - :param metadata2: second dataframe metadata - :return: float number 0 or 1 - - data_kinds = [DataKind.BOOL, DataKind.ID, DataKind.CATEGORICAL, DataKind.CONSTANT] - compare_methods = [self.compare_bools, self.compare_ids, self.compare_categoricals, self.compare_constants] - - for kind, method in zip(data_kinds, compare_methods): - if kind in self.compare_kind: - if index1 in metadata1.column_kind[kind] and index2 in metadata2.column_kind[kind]: - return method() - if index1 in metadata1.column_kind[kind] or index2 in metadata2.column_kind[kind]: - return 1 - return np.nan - - """ - are_nulls = (False, 0.0) - if DataKind.BOOL in self.compare_kind and DataKind.BOOL in metadata1.column_kind and DataKind.BOOL in metadata2.column_kind: - if index1 in metadata1.column_kind[DataKind.BOOL] and index2 in metadata2.column_kind[DataKind.BOOL]: - return self.compare_bools(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) - are_nulls = are_columns_null(metadata1.column_kind[DataKind.BOOL], metadata2.column_kind[DataKind.BOOL], "Boolean column") - - if DataKind.ID in self.compare_kind and DataKind.ID in metadata1.column_kind and DataKind.ID in metadata2.column_kind: - if index1 in metadata1.column_kind[DataKind.ID] and index2 in metadata2.column_kind[DataKind.ID]: - return self.compare_ids(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) - are_nulls = are_columns_null(metadata1.column_kind[DataKind.ID], metadata2.column_kind[DataKind.ID], "ID column") - - if DataKind.CATEGORICAL in self.compare_kind and DataKind.CATEGORICAL in metadata1.column_kind and DataKind.CATEGORICAL in metadata2.column_kind: - if index1 in metadata1.column_kind[DataKind.CATEGORICAL] and index2 in metadata2.column_kind[DataKind.CATEGORICAL]: - return self.compare_categoricals(metadata1.categorical_metadata[index1], 
metadata2.categorical_metadata[index2]) - are_nulls = are_columns_null(metadata1.column_kind[DataKind.CATEGORICAL], metadata2.column_kind[DataKind.CATEGORICAL], "Categorical column") - - if DataKind.CONSTANT in self.compare_kind and DataKind.CONSTANT in metadata1.column_kind and DataKind.CONSTANT in metadata2.column_kind: - if index1 in metadata1.column_kind[DataKind.CONSTANT] and index2 in metadata2.column_kind[DataKind.CONSTANT]: - return self.compare_constants(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) - - are_nulls = are_columns_null(metadata1.column_kind[DataKind.CONSTANT], metadata2.column_kind[DataKind.CONSTANT], "Constant column") - - if are_nulls[0]: - return are_nulls[1] - return np.nan - - -class ColumnTypeHandler(SpecificColumnHandler): - - def __numerical_compare1( - self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int - ) -> float: - num_met1 = metadata1.numerical_metadata[index1] - num_met2 = metadata2.numerical_metadata[index2] - if num_met1.same_value_length == num_met2.same_value_length: - score += 2 - if num_met1.min_value == num_met2.min_value: - score += 1 - elif num_met1.min_value == num_met2.min_value + num_met1.range_size / 100 or num_met1.max_value == num_met2.max_value - num_met1.range_size / 100: - score += 0.5 - if num_met1.max_value == num_met2.max_value: - score += 1 - elif num_met1.max_value == num_met2.max_value - num_met1.range_size / 100 or num_met1.max_value == num_met2.max_value + num_met1.range_size / 100: - score += 0.5 - if num_met1.range_size == num_met2.range_size: - score += 2 - return 1 - score / 9 - - def __nonnumerical_compare1( - self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int - ) -> float: - num_met1 = metadata1.nonnumerical_metadata[index1] - num_met2 = metadata2.nonnumerical_metadata[index2] - if num_met1.longest == num_met2.longest or num_met1.longest is num_met2.longest: - score += 2 - if num_met1.shortest == 
num_met2.shortest or num_met1.shortest is num_met2.shortest: - score += 2 - if num_met1.avg_length == num_met2.avg_length: - score += 2 - elif num_met1.avg_length == num_met2.avg_length + num_met1.avg_length / 100 or num_met1.avg_length == num_met2.avg_length - num_met1.avg_length / 100: - score += 1 - return 1 - score / 9 - - def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: - """ - Compare if two columns have the same type. - :param index2: name of column in metadata2 - :param index1: name of column in metadata1 - :param metadata1: first dataframe metadata - :param metadata2: second dataframe metadata - :return: float number between 0 and 1 (distance) - """ - column1_type = metadata1.get_column_type(index1) - column2_type = metadata2.get_column_type(index2) - score = 3 if column1_type == column2_type else 0 - if index1 in metadata1.numerical_metadata and index2 in metadata2.numerical_metadata: - return self.__numerical_compare1(metadata1, metadata2, index1, index2, score) - - if index1 in metadata1.nonnumerical_metadata and index2 in metadata2.nonnumerical_metadata: - return self.__nonnumerical_compare1(metadata1, metadata2, index1, index2, score) - - if column1_type == column2_type: - return 0 - return 1 - class ComparatorByColumn(Comparator): """ diff --git a/similarity_framework/src/impl/comparator/comparator_by_type.py b/similarity_framework/src/impl/comparator/comparator_by_type.py index c37e0a4..af16931 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_type.py +++ b/similarity_framework/src/impl/comparator/comparator_by_type.py @@ -20,499 +20,6 @@ from similarity_framework.src.models.settings import AnalysisSettings -class CategoricalHandler(HandlerType): - """ - Categorical Handler class - """ - - def __compute_distance(self, dist_matrix: list[list[float]]) -> float: # Hausdorff - """ - Compute distance from similarity matrix - todo maybe switch to hausdorfdist?? 
- """ - row_mins = [] - column_mins = [] - for row in dist_matrix: - row_mins.append(min(row)) - for column in zip(*dist_matrix): - column_mins.append(min(column)) - return min([max(row_mins), max(column_mins)]) - - def __create_dist_matrix(self, embeddings1: list[Tensor], embeddings2: list[Tensor]) -> list[list[float]]: - """ - creates similarity matrix for embeddings - :param embeddings1: embeddings for first column - :param embeddings2: embeddings for second column - :return: similarity matrix - """ - simil_matrix = [] - for embed1 in embeddings1: - siml_line = [] - for embed2 in embeddings2: - # todo rounding for 3 digits ? ok -> two because of minus 0 - siml_line.append( - round( - 1 - - round( - cosine_sim(embed1, embed2), - 4, - ), - 3, - ) - ) # distance is 1- similarity - simil_matrix.append(siml_line) - return simil_matrix - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - """ - Compare two categorical columns - the distance is between 0 and 1 - :param metadata1: first table - :param metadata2: second table - :return: dataframe full of numbers between 0 and 1 - """ - result = pd.DataFrame() - name_distance = pd.DataFrame() - for id1, ( - column1, - categorical1, - ) in enumerate(metadata1.categorical_metadata.items()): - for id2, ( - column2, - categorical2, - ) in enumerate(metadata2.categorical_metadata.items()): - simil_matrix = self.__create_dist_matrix( - categorical1.category_embedding, - categorical2.category_embedding, - ) - # count, score = self.__compute_similarity_score(simil_matrix) - dist = self.__compute_distance(simil_matrix) - ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) - result.loc[id1, id2] = dist * ratio - name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2]) - # todo p value or correlation - return concat(result, name_distance) - - -## TODO Kind, Type - - -class 
CategoricalHandlerSimilar(CategoricalHandler): - """ - Handler for column category - """ - - def __create_sim_matrix(self, embeddings1: list[Tensor], embeddings2: list[Tensor]) -> list[list[float]]: - simil_matrix = [] - for embed1 in embeddings1: - siml_line = [] - for embed2 in embeddings2: - siml_line.append( - round( - cosine_sim(embed1, embed2), - 3, - ) - ) - simil_matrix.append(siml_line) - return simil_matrix - - def __compute_similarity_score(self, similarity_matrix: list[list[float]]) -> tuple[int, float]: # todo test some other methods - # todo use Haufsdorfe distance ? - res = 0.0 - count = 0 - trashold = 0.7 # todo set from outside - for i in similarity_matrix: - if max(i) > trashold: - count += 1 - res += max(i) - return count, res / len(similarity_matrix) * (count / len(similarity_matrix)) - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - """ - Compare categorical columns, if the columns are similar - :param metadata1: first table - :param metadata2: second table - :return: dataframe full of numbers between 0 and 1 - """ - result = pd.DataFrame() - name_distance = pd.DataFrame() - for id1, (column1, categorical1) in enumerate(metadata1.categorical_metadata.items()): - for id2, (column2, categorical2) in enumerate(metadata2.categorical_metadata.items()): - simil_matrix = self.__create_sim_matrix(categorical1.category_embedding, categorical2.category_embedding) - _, score = self.__compute_similarity_score(simil_matrix) - ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) # todo 1-ratio??? 
- result.loc[id1, id2] = 1 - (score * ratio) - name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2]) - # todo p value or correlation - return concat(result, name_distance) - - -# class ColumnEmbeddingHandler(HandlerType): -# """ -# Handler for column values embeddings -# """ -# -# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: -# """ -# Compare embeddings of columns -# :param metadata1: first table -# :param metadata2: second table -# :return: dataframe full of numbers between 0 and 1 -# """ -# result = pd.DataFrame() -# name_distance = pd.DataFrame() -# for id1, ( -# column1, -# embedding1, -# ) in enumerate(metadata1.column_embeddings.items()): -# for id2, ( -# column2, -# embedding2, -# ) in enumerate(metadata2.column_embeddings.items()): -# result.loc[id1, id2] = 1 - cosine_sim(embedding1, embedding2) -# name_distance.loc[id1, id2] = 1 - cosine_sim( -# metadata1.column_embeddings[column1], -# metadata2.column_embeddings[column2], -# ) -# return concat(result, name_distance) - - -# class SizeHandler(HandlerType): -# """ -# Size of table Handler class -# """ -# -# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: -# """ -# If sizes are the same distance is 0, else distance is 1 - % of max -# :param metadata1: first table -# :param metadata2: second table -# :return: dataframe of size 1x1 fill with distance number (0-1) # todo test -# """ -# max_size = int(max(metadata1.size, metadata2.size)) -# min_size = int(min(metadata1.size, metadata2.size)) -# distance = 1 - (min_size / max_size) -# return pd.DataFrame(index=range(1), columns=range(1)).fillna(distance) -# # todo if this is not working try this We will fill the whole table with this numer, distance function should compute the same number (todo test) - - -# class ColumnExactNamesHandler(HandlerType): -# """ -# Handler for exact column names -# """ -# -# def 
compare(self, metadata1: Metadata, metadata2: Metadata, *kwargs) -> pd.DataFrame: -# """ -# This is dummy Handler if the names are exactly the same distance is 0 if not distance is 1 -# :param metadata1: first table -# :param metadata2: second table -# :return: dataframe fill by 0 and 1 -# """ -# if metadata1.column_names_clean == {} or metadata2.column_names_clean == {}: -# logger.warning("Warning: column_names_clean is not computed") -# return fill_result(metadata1.column_names_clean, metadata2.column_names_clean) - - -# class ColumnNamesEmbeddingsHandler(HandlerType): -# """ -# Handler for column names embeddings -# """ -# -# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: -# """ -# Computes cosine distance for each column name embedding -# :param distance_function: - not used -# :param metadata1: first table -# :param metadata2: second table -# :param settings: - not used -# :return: dataframe fill by distances between 0 and 1 -# """ -# if metadata1.column_name_embeddings == {} or metadata2.column_name_embeddings == {}: -# logger.warning("Warning: column name embedding is not computed") -# -# result = pd.DataFrame() -# for idx1, name1 in enumerate(metadata1.column_name_embeddings.values()): -# for idx2, name2 in enumerate(metadata2.column_name_embeddings.values()): -# result.loc[idx1, idx2] = 1 - cosine_sim(name1, name2) -# return result - - -# class IncompleteColumnsHandler(HandlerType):# todo thsi ok -# """ -# Handler for incomplete columns -# """ -# -# def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: -# """ -# Compare if two columns are complete or incomplete, if both have same outcome (True False) -# the distance is 0 otherwise is 1 -# :param metadata1: first table -# :param metadata2: second table -# :return: dataframe full of 1 and 0 -# """ -# return fill_result(metadata1.column_incomplete, metadata2.column_incomplete) ## - - -class KindHandler(HandlerType): - """ - Handler for 
column kind - """ - - def __init__( - self, distance_function: DistanceFunction = HausdorffDistanceMin(), compare_kind: list[DataKind] = None, weight: dict[DataKind.BOOL, int] = None - ): - super().__init__(weight=1) - self.distance_function = distance_function - if compare_kind is None: - self.compare_kind = [ - DataKind.BOOL, - DataKind.ID, - DataKind.CATEGORICAL, - DataKind.CONSTANT, - ] - else: - self.compare_kind = compare_kind - if weight is None: - self.kind_weight: dict = {DataKind.BOOL: 1, DataKind.ID: 1, DataKind.CATEGORICAL: 1, DataKind.CONSTANT: 1} - else: - self.kind_weight = weight - - def compute_result(self, distance_table: pd.DataFrame, distance_function: DistanceFunction, settings: set[Settings], weight: int): - """ - Compute result from distance table - """ - tmp = pow(distance_function.compute(distance_table), 2) * weight - if Settings.NO_RATIO not in settings: - tmp = tmp * get_ratio( - distance_table.shape[0], - distance_table.shape[1], - ) - return tmp - - def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: - """ - Creates table of distances between embeddings for each row and computes mean - of row and column minimums then pick max. 
- :param embeddings1: values for column1 - :param embeddings2: values for column2 - :return: float from 0 to 1 - """ - res = [] - row_mins = [] - id1 = 0 - for embed1 in embeddings1: - results = [] - for embed2 in embeddings2: - result = 1 - cosine_sim(embed1, embed2) - results.append(result) - res.append(results) - row_mins.append(min(results)) - id1 += 1 - column_mins = [] - for_iter = pd.DataFrame(data=res) - for _, column in for_iter.items(): - column_mins.append(min(column)) - return max([mean(column_mins), mean(row_mins)]) # todo vysvetlit v textu - - def __are_columns_null(self, column1: set, column2: set, message: str) -> tuple[bool, pd.DataFrame]: - """ - Check if columns are empty - :param column1: - :param column2: - :param message: - :return: tuple of bool and dataframe, if columns are empty return True - """ - if len(column1) == 0 and len(column2) == 0: - logger.warning(f"{message} is not present in the dataframe.") - return True, pd.DataFrame([0]) - if (len(column1) == 0) != (len(column2) == 0): - logger.warning(f"{message} is not present in one of the dataframes.") - return True, pd.DataFrame([1]) - return False, pd.DataFrame() - - def compare_constants(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: - """ - Compare all constant columns. Compare if they contain nulls. - Compare embeddings of values. - Make an average of these values. 
- :param metadata1: for column1 - :param metadata2: for column2 - :return: matrix containing float numbers in range <0, 1> - """ - value_re = pd.DataFrame() - nulls_re = pd.DataFrame() - are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.CONSTANT], metadata2.column_kind[DataKind.CONSTANT], "Constant metadata") - if are_nulls[0]: - return are_nulls[1] - for column1 in metadata1.column_kind[DataKind.CONSTANT]: - for column2 in metadata2.column_kind[DataKind.CONSTANT]: - # Extract metadata for columns - meta1 = metadata1.kind_metadata[column1] - meta2 = metadata2.kind_metadata[column2] - - if meta1.value_embeddings is None or meta2.value_embeddings is None: - # 0 distance if values are the same otherwise 1 - value_re.loc[column1, column2] = int(meta1.value != meta2.value) - else: - value_re.loc[column1, column2] = 1 - cosine_sim( - meta1.value_embeddings[0], # todo 0 nebo 1 - meta2.value_embeddings[0], - ) - - # 0 distance if values are the same otherwise 1 - nulls_re.loc[column1, column2] = int(meta1.nulls != meta2.nulls) - - # if nulls are equal and exist - if nulls_re.loc[column1, column2] == 0 and meta1.nulls: - ratio1 = meta1.distribution[0] / meta1.distribution[1] - ratio2 = meta2.distribution[0] / meta2.distribution[1] - nulls_re.loc[column1, column2] = abs(ratio1 - ratio2) # compute difference between distribution - - return concat(nulls_re, value_re) - - def compare_ids(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: - """ - Compare all id columns. Compare if they contain nulls. - Compare embeddings of values. - Compare ratio of max length. - Make an average of these values. 
- :return: matrix containing float numbers in range <0, 1> - """ - nulls_re = pd.DataFrame() - value_long_re = pd.DataFrame() - value_short_re = pd.DataFrame() - ratio_max_re = pd.DataFrame() - are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.ID], metadata2.column_kind[DataKind.ID], "ID metadata") - if are_nulls[0]: - return are_nulls[1] - for column1 in metadata1.column_kind[DataKind.ID]: - for column2 in metadata2.column_kind[DataKind.ID]: - for value_re, attribute in [(value_long_re, "longest"), (value_short_re, "shortest")]: - embeddings1 = getattr(metadata1.kind_metadata[column1], f"{attribute}_embeddings") - embeddings2 = getattr(metadata2.kind_metadata[column2], f"{attribute}_embeddings") - attribute1 = getattr(metadata1.kind_metadata[column1], attribute) - attribute2 = getattr(metadata2.kind_metadata[column2], attribute) - - if embeddings1 is None or embeddings2 is None: - value_re.loc[column1, column2] = 0 if attribute1 == attribute2 else 1 - else: - value_re.loc[column1, column2] = 1 - cosine_sim( - embeddings1, - embeddings2, - ) - nulls_re.loc[column1, column2] = 0 if metadata1.kind_metadata[column1].nulls == metadata2.kind_metadata[column2].nulls else 1 - ratio_max_re.loc[column1, column2] = abs(metadata1.kind_metadata[column1].ratio_max_length - metadata2.kind_metadata[column2].ratio_max_length) - - return concat( - value_short_re, - value_long_re, - ratio_max_re, - nulls_re, - ) - - def compare_bools(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: - """ - Compare all boolean columns. Compare if they have the same distribution of True and False values. - Compare if they contain nulls. - Compare embeddings of values. - Make an average of these values. 
- :param metadata1: for column1 - :param metadata2: for column2 - :return: matrix containing float numbers in range <0, 1> - """ - value_re = pd.DataFrame() - distr_re = pd.DataFrame() - nulls_re = pd.DataFrame() - are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.BOOL], metadata2.column_kind[DataKind.BOOL], "Boolean metadata") - if are_nulls[0]: - return are_nulls[1] - for column1 in metadata1.column_kind[DataKind.BOOL]: - for column2 in metadata2.column_kind[DataKind.BOOL]: - nulls_re.loc[column1, column2] = 0 if metadata1.kind_metadata[column1].nulls == metadata2.kind_metadata[column2].nulls else 1 - distr_re.loc[column1, column2] = abs( - metadata1.kind_metadata[column1].distribution[0] / metadata1.kind_metadata[column1].distribution[1] - - metadata2.kind_metadata[column2].distribution[0] / metadata2.kind_metadata[column2].distribution[1] - ) - if metadata1.kind_metadata[column1].value_embeddings is None or metadata2.kind_metadata[column2].value_embeddings is None: - value_re.loc[column1, column2] = 0 - else: - value_re.loc[column1, column2] = self.compute_embeddings_distance( - metadata1.kind_metadata[column1].value_embeddings, metadata2.kind_metadata[column2].value_embeddings - ) - return concat(value_re, distr_re, nulls_re) - - def compare_categorical(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: - """ - Compare all categorical columns. Compare if they contain nulls. - Compare embeddings of values. - Make an average of these values. 
- :param metadata1: for column1 - :param metadata2: for column2 - :return: matrix containing float numbers in range <0, 1> - """ - value_re = pd.DataFrame() - count_re = pd.DataFrame() - are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.CATEGORICAL], metadata2.column_kind[DataKind.CATEGORICAL], "Categorical metadata") - if are_nulls[0]: - return are_nulls[1] - for column1 in metadata1.column_kind[DataKind.CATEGORICAL]: - for column2 in metadata2.column_kind[DataKind.CATEGORICAL]: - value_re.loc[column1, column2] = self.compute_embeddings_distance( - metadata1.categorical_metadata[column1].category_embedding, metadata2.categorical_metadata[column2].category_embedding - ) - count1 = metadata1.categorical_metadata[column1].count_categories - count2 = metadata2.categorical_metadata[column2].count_categories - count_re.loc[column1, column2] = count1 / count2 if count1 < count2 else count2 / count1 - # todo compare categories_with_count for metadata1 and metadata2 - # firstly normalize dictionary categories_with_count then - # compare the difference between the two dictionaries - return concat(value_re, count_re) - - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: - """ - Compare kind columns - :param metadata1: first table - :param metadata2: second table - :return: dataframe full of numbers between 0 and 1 - """ - result = 0 - if DataKind.BOOL in self.compare_kind: - bools = self.compare_bools(metadata1, metadata2) - result += self.compute_result( - bools, - self.distance_function, - self.settings, - self.kind_weight[DataKind.BOOL], - ) - if DataKind.CONSTANT in self.compare_kind: - constants = self.compare_constants(metadata1, metadata2) - result += self.compute_result( - constants, - self.distance_function, - self.settings, - self.kind_weight[DataKind.CONSTANT], - ) - if DataKind.ID in self.compare_kind: - ids = self.compare_ids(metadata1, metadata2) - result += self.compute_result( - ids, - 
self.distance_function, - self.settings, - self.kind_weight[DataKind.ID], - ) - if DataKind.CATEGORICAL in self.compare_kind: - categorical = self.compare_categorical(metadata1, metadata2) - result += self.compute_result( - categorical, - self.distance_function, - self.settings, - self.kind_weight[DataKind.CATEGORICAL], - ) - return pd.DataFrame([result]) - - - - class ComparatorByType(Comparator): """ Comparator for comparing two tables by type diff --git a/similarity_framework/src/impl/comparator/handlers.py b/similarity_framework/src/impl/comparator/handlers.py new file mode 100644 index 0000000..7b54dd8 --- /dev/null +++ b/similarity_framework/src/impl/comparator/handlers.py @@ -0,0 +1,815 @@ +import logging +from abc import ABC, abstractmethod +from statistics import mean + +import numpy as np +import pandas as pd +from pyarrow import Tensor + +from logging_ import logger +from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin +from similarity_framework.src.impl.comparator.utils import cosine_sim, are_columns_null, get_ratio, concat +from similarity_framework.src.interfaces.common import DistanceFunction +from similarity_framework.src.interfaces.comparator.handler import HandlerType +from similarity_framework.src.models.metadata import Metadata, KindMetadata, CategoricalMetadata +from similarity_framework.src.models.settings import Settings +from similarity_framework.src.models.types_ import DataKind + + +class BasicHandler(HandlerType): + + def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: + if "index1" not in kwargs or "index2" not in kwargs: + raise RuntimeError(f"Handler didnt have sufficient arguments - index1 and index2 - {kwargs}") + return self._inner_compare(metadata1, metadata2, kwargs["index1"], kwargs["index2"]) + + @abstractmethod + def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> pd.DataFrame: + pass + + +class 
TableHandler(HandlerType, ABC): + """ + Abstract class for table handlers it should compare features of whole table + """ + + +class GeneralColumnHandler(BasicHandler, ABC): + """ + Handler for simple comparison + """ + + +class SpecificColumnHandler(BasicHandler, ABC): + """ + Handler for advanced comparison + """ + + +class SizeHandler(TableHandler): + """ + Handler of size of two tables + """ + + def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> float: + """ + Compare the size of the two dataframes. If sizes are the same distance is 0, else distance is 1 - % of max size. + :param index1: in this case is not used + :param index2: in this case it not used + :param metadata1: first dataframe metadata + :param metadata2: second dataframe metadata + :return: float number in range <0, 1> + """ + max_size = int(max(metadata1.size, metadata2.size)) + min_size = int(min(metadata1.size, metadata2.size)) + return 1 - (min_size / max_size) + + +class IncompleteColumnsHandler(GeneralColumnHandler): + """ + Handler for incomplete columns + """ + + def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: + """ + Compare if two columns are complete or incomplete. If both are complete, + or both are incomplete distance is 0, else distance is 1 + :param index2: name or id of column in metadata2 + :param index1: name or id of column in metadata1 + :param metadata1: first dataframe metadata + :param metadata2: second dataframe metadata + :return: float number 0 or 1 + """ + return 0 if metadata1.column_incomplete[index1] == metadata2.column_incomplete[index2] else 1 + + +class ColumnExactNamesHandler(GeneralColumnHandler): + """ + Handler for exact column names + """ + + def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: + """ + Compare if two columns have the same name. If both have the same name distance is 0, else distance is 1. 
+ :param index2: name or id of column in metadata2 + :param index1: name or id of column in metadata1 + :param metadata1: first dataframe metadata + :param metadata2: second dataframe metadata + :return: float number 0 or 1 + """ + if metadata1.column_names_clean == {} or metadata2.column_names_clean == {}: + logger.warning("Warning: column_names_clean is not computed") + return 0 if metadata1.column_names_clean[index1] == metadata2.column_names_clean[index2] else 1 + + +class ColumnNamesEmbeddingsHandler(GeneralColumnHandler): + """ + Handler for column names embeddings + """ + + def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: + """ + Compare if two columns have similar name. Computes cosine distance for embeddings + :param index2: name or id of column in metadata2 + :param index1: name or id of column in metadata1 + :param metadata1: first dataframe metadata + :param metadata2: second dataframe metadata + :return: float number in range <0, 1> 0 exactly the same 1 completely different + """ + if metadata1.column_name_embeddings == {} or metadata2.column_name_embeddings == {}: + logging.warning("Warning: column name embedding is not computed") + return 1 + return 1 - cosine_sim( + metadata1.column_name_embeddings[index1], + metadata2.column_name_embeddings[index2], + ) + + +class ColumnEmbeddingsHandler(GeneralColumnHandler): + """ + Handler for column values embeddings + """ + + def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: + """ + Compare embeddings for two columns. Computes cosine distance for embeddings. 
+ :param index2: name or id of column in metadata2 + :param index1: name or id of column in metadata1 + :param metadata1: first dataframe metadata + :param metadata2: second dataframe metadata + :return: float number in range <0, 1> 0 exactly the same 1 completely different + """ + if ( + metadata1.column_embeddings == {} + or metadata2.column_embeddings == {} + or index1 not in metadata1.column_embeddings + or index2 not in metadata2.column_embeddings + ): + logger.debug( + f"column embedding is not computed - [{metadata1.column_embeddings == {}} - {metadata2.column_embeddings == {}}] {index1 if index1 not in metadata1.column_embeddings else index2}" + ) + return np.nan + return 1 - cosine_sim( + metadata1.column_embeddings[index1], + metadata2.column_embeddings[index2], + ) + + +class ColumnKindHandler(SpecificColumnHandler): + """ + Handler for column kind + """ + + def __init__(self, compare_kind=None, weight=1): + """ + Constructor for ColumnKindHandler, sets which kinds should be compared and weight for each kind + """ + super().__init__(weight=weight) + if compare_kind is None: + self.compare_kind = [ + DataKind.BOOL, + DataKind.ID, + DataKind.CATEGORICAL, + DataKind.CONSTANT, + ] + else: + self.compare_kind = compare_kind + if weight is None: + self.kind_weight: dict = {DataKind.BOOL: 1, DataKind.ID: 1, DataKind.CATEGORICAL: 1, DataKind.CONSTANT: 1} + else: + self.kind_weight = weight + + def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # todo add type + """ + Creates table of distances between embeddings for each row and computes mean + of row and column minimums then pick max. 
+ :param embeddings1: values for column1
+ :param embeddings2: values for column2
+ :return: float from 0 to 1
+ """
+ # alternative version
+ # res = pd.DataFrame()
+ # row_mins = []
+ # for id1, embed1 in enumerate(embeddings1):
+ # for id2, embed2 in enumerate(embeddings2):
+ # res.loc[id1, id2] = 1 - cosine_sim(embed1, embed2)
+ # row_mins.append(res.loc[id1].min())
+ # column_mins = []
+ # for _, column in res.items():
+ # column_mins.append(min(column))
+ # return max([mean(row_mins), mean(column_mins)])
+
+ similarity_matrix = [[1 - cosine_sim(embed1, embed2) for embed2 in embeddings2] for embed1 in embeddings1]
+ res = pd.DataFrame(similarity_matrix)
+ row_mins = res.min(axis=1).tolist()
+ column_mins = res.min(axis=0).tolist()
+ return max(mean(row_mins), mean(column_mins))
+ # TODO: explain this in the thesis text
+
+ def compare_bools(
+ self,
+ metadata1: KindMetadata,
+ metadata2: KindMetadata,
+ ) -> float:
+ """
+ Compare two boolean columns. Compare if they have the same distribution of True and False values.
+ Compare if they contain nulls.
+ Compare embeddings of values.
+ Make an average of these values.
+ :param metadata1: for column1 + :param metadata2: for column2 + :return: float number in range <0, 1> + """ + nulls = 0 if metadata1.nulls == metadata2.nulls else 1 + dist1 = metadata1.distribution[0] / metadata1.distribution[1] if metadata1.distribution[1] > metadata1.distribution[0] else metadata1.distribution[1] / metadata1.distribution[0] + dist2 = metadata2.distribution[0] / metadata2.distribution[1] if metadata2.distribution[1] > metadata2.distribution[0] else metadata2.distribution[1] / metadata2.distribution[0] + distr = abs(dist1 - dist2) + if metadata1.value_embeddings is None or metadata2.value_embeddings is None: + return (nulls + distr) / 2 + return ( + nulls + + distr + + self.compute_embeddings_distance( + metadata1.value_embeddings, + metadata2.value_embeddings, + ) + ) / 3 + + def compare_categoricals( + self, + metadata1: CategoricalMetadata, + metadata2: CategoricalMetadata, + ) -> float: + """ + Compare two categorical columns. Compare if they contain nulls. + Compare embeddings of values. + Make an average of these values. + :param metadata1: for column1 + :param metadata2: for column2 + :return: float number in range <0, 1> + """ + value_re = self.compute_embeddings_distance( + metadata1.category_embedding, + metadata2.category_embedding, + ) + count1 = metadata1.count_categories + count2 = metadata2.count_categories + count_re = 1 - count1 / count2 if count1 < count2 else 1 - count2 / count1 + # todo compare categories_with_count for metadata1 and metadata2 + # firstly normalize dictionary categories_with_count then + # compare the difference between the two dictionaries + return (value_re + count_re) / 2 + + def compare_constants( + self, + metadata1: KindMetadata, + metadata2: KindMetadata, + ) -> float: + """ + Compare two constant columns. Compare if they contain nulls. + Compare embeddings of values. + Make an average of these values. 
+ :param metadata1: for column1 + :param metadata2: for column2 + :return: float number in range <0, 1> + """ + nulls = 0 if metadata1.nulls == metadata2.nulls else 1 + if metadata1.value_embeddings is None or metadata2.value_embeddings is None: + value: float = 0 if metadata1.value == metadata2.value else 1 + else: + value = 1 - cosine_sim( + metadata1.value_embeddings[0], + metadata2.value_embeddings[0], + ) + # if nulls are equal and exist + if nulls == 0 and metadata1.nulls: + ratio1 = metadata1.distribution[0] / metadata1.distribution[1] + ratio2 = metadata2.distribution[0] / metadata2.distribution[1] + nulls = abs(ratio1 - ratio2) # compute difference between distribution + return (nulls + value) / 2 + + def compare_ids( + self, + metadata1: KindMetadata, + metadata2: KindMetadata, + ) -> float: + """ + Compare two id columns. Compare if they contain nulls. + Compare embeddings of values. + Compare ratio of max length. + Make an average of these values. + :return: float number in range <0, 1> + """ + embeddings1_longest = metadata1.longest_embeddings + embeddings2_longest = metadata2.longest_embeddings + embeddings1_shortest = metadata1.shortest_embeddings + embeddings2_shortest = metadata2.shortest_embeddings + + if embeddings1_longest is not None and embeddings2_longest is not None: + value_long_re = 1 - cosine_sim( + embeddings1_longest, + embeddings2_longest, + ) + else: + value_long_re = 0 if metadata1.longest == metadata2.longest else 1 + if embeddings1_shortest is not None and embeddings2_shortest is not None: + value_short_re = 1 - cosine_sim( + embeddings1_shortest, + embeddings2_shortest, + ) + else: + value_short_re = 0 if metadata1.shortest == metadata2.shortest else 1 + + nulls_re = 0 if metadata1.nulls == metadata2.nulls else 1 + ratio_max_re = abs(metadata1.ratio_max_length - metadata2.ratio_max_length) + return (value_short_re + value_long_re + nulls_re + ratio_max_re) / 4 + + def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, 
index1: str, index2: str) -> float: + """ + Compare if two columns have the same kind. If both have the same kind distance is 0, else distance is 1. + :param index2: name or id of column in metadata2 + :param index1: name or id of column in metadata1 + :param metadata1: first dataframe metadata + :param metadata2: second dataframe metadata + :return: float number 0 or 1 + + data_kinds = [DataKind.BOOL, DataKind.ID, DataKind.CATEGORICAL, DataKind.CONSTANT] + compare_methods = [self.compare_bools, self.compare_ids, self.compare_categoricals, self.compare_constants] + + for kind, method in zip(data_kinds, compare_methods): + if kind in self.compare_kind: + if index1 in metadata1.column_kind[kind] and index2 in metadata2.column_kind[kind]: + return method() + if index1 in metadata1.column_kind[kind] or index2 in metadata2.column_kind[kind]: + return 1 + return np.nan + + """ + are_nulls = (False, 0.0) + if DataKind.BOOL in self.compare_kind and DataKind.BOOL in metadata1.column_kind and DataKind.BOOL in metadata2.column_kind: + if index1 in metadata1.column_kind[DataKind.BOOL] and index2 in metadata2.column_kind[DataKind.BOOL]: + return self.compare_bools(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) + are_nulls = are_columns_null(metadata1.column_kind[DataKind.BOOL], metadata2.column_kind[DataKind.BOOL], "Boolean column") + + if DataKind.ID in self.compare_kind and DataKind.ID in metadata1.column_kind and DataKind.ID in metadata2.column_kind: + if index1 in metadata1.column_kind[DataKind.ID] and index2 in metadata2.column_kind[DataKind.ID]: + return self.compare_ids(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) + are_nulls = are_columns_null(metadata1.column_kind[DataKind.ID], metadata2.column_kind[DataKind.ID], "ID column") + + if DataKind.CATEGORICAL in self.compare_kind and DataKind.CATEGORICAL in metadata1.column_kind and DataKind.CATEGORICAL in metadata2.column_kind: + if index1 in 
metadata1.column_kind[DataKind.CATEGORICAL] and index2 in metadata2.column_kind[DataKind.CATEGORICAL]: + return self.compare_categoricals(metadata1.categorical_metadata[index1], metadata2.categorical_metadata[index2]) + are_nulls = are_columns_null(metadata1.column_kind[DataKind.CATEGORICAL], metadata2.column_kind[DataKind.CATEGORICAL], "Categorical column") + + if DataKind.CONSTANT in self.compare_kind and DataKind.CONSTANT in metadata1.column_kind and DataKind.CONSTANT in metadata2.column_kind: + if index1 in metadata1.column_kind[DataKind.CONSTANT] and index2 in metadata2.column_kind[DataKind.CONSTANT]: + return self.compare_constants(metadata1.kind_metadata[index1], metadata2.kind_metadata[index2]) + + are_nulls = are_columns_null(metadata1.column_kind[DataKind.CONSTANT], metadata2.column_kind[DataKind.CONSTANT], "Constant column") + + if are_nulls[0]: + return are_nulls[1] + return np.nan + + +class ColumnTypeHandler(SpecificColumnHandler): + + def __numerical_compare1( + self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int + ) -> float: + num_met1 = metadata1.numerical_metadata[index1] + num_met2 = metadata2.numerical_metadata[index2] + if num_met1.same_value_length == num_met2.same_value_length: + score += 2 + if num_met1.min_value == num_met2.min_value: + score += 1 + elif num_met1.min_value == num_met2.min_value + num_met1.range_size / 100 or num_met1.max_value == num_met2.max_value - num_met1.range_size / 100: + score += 0.5 + if num_met1.max_value == num_met2.max_value: + score += 1 + elif num_met1.max_value == num_met2.max_value - num_met1.range_size / 100 or num_met1.max_value == num_met2.max_value + num_met1.range_size / 100: + score += 0.5 + if num_met1.range_size == num_met2.range_size: + score += 2 + return 1 - score / 9 + + def __nonnumerical_compare1( + self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int + ) -> float: + num_met1 = metadata1.nonnumerical_metadata[index1] + num_met2 = 
metadata2.nonnumerical_metadata[index2] + if num_met1.longest == num_met2.longest or num_met1.longest is num_met2.longest: + score += 2 + if num_met1.shortest == num_met2.shortest or num_met1.shortest is num_met2.shortest: + score += 2 + if num_met1.avg_length == num_met2.avg_length: + score += 2 + elif num_met1.avg_length == num_met2.avg_length + num_met1.avg_length / 100 or num_met1.avg_length == num_met2.avg_length - num_met1.avg_length / 100: + score += 1 + return 1 - score / 9 + + def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str) -> float: + """ + Compare if two columns have the same type. + :param index2: name of column in metadata2 + :param index1: name of column in metadata1 + :param metadata1: first dataframe metadata + :param metadata2: second dataframe metadata + :return: float number between 0 and 1 (distance) + """ + column1_type = metadata1.get_column_type(index1) + column2_type = metadata2.get_column_type(index2) + score = 3 if column1_type == column2_type else 0 + if index1 in metadata1.numerical_metadata and index2 in metadata2.numerical_metadata: + return self.__numerical_compare1(metadata1, metadata2, index1, index2, score) + + if index1 in metadata1.nonnumerical_metadata and index2 in metadata2.nonnumerical_metadata: + return self.__nonnumerical_compare1(metadata1, metadata2, index1, index2, score) + + if column1_type == column2_type: + return 0 + return 1 + + + + + +class CategoricalHandler(HandlerType): + """ + Categorical Handler class + """ + + def __compute_distance(self, dist_matrix: list[list[float]]) -> float: # Hausdorff + """ + Compute distance from similarity matrix + todo maybe switch to hausdorfdist?? 
+ """ + row_mins = [] + column_mins = [] + for row in dist_matrix: + row_mins.append(min(row)) + for column in zip(*dist_matrix): + column_mins.append(min(column)) + return min([max(row_mins), max(column_mins)]) + + def __create_dist_matrix(self, embeddings1: list[Tensor], embeddings2: list[Tensor]) -> list[list[float]]: + """ + creates similarity matrix for embeddings + :param embeddings1: embeddings for first column + :param embeddings2: embeddings for second column + :return: similarity matrix + """ + simil_matrix = [] + for embed1 in embeddings1: + siml_line = [] + for embed2 in embeddings2: + # todo rounding for 3 digits ? ok -> two because of minus 0 + siml_line.append( + round( + 1 + - round( + cosine_sim(embed1, embed2), + 4, + ), + 3, + ) + ) # distance is 1- similarity + simil_matrix.append(siml_line) + return simil_matrix + + def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: + """ + Compare two categorical columns + the distance is between 0 and 1 + :param metadata1: first table + :param metadata2: second table + :return: dataframe full of numbers between 0 and 1 + """ + result = pd.DataFrame() + name_distance = pd.DataFrame() + for id1, ( + column1, + categorical1, + ) in enumerate(metadata1.categorical_metadata.items()): + for id2, ( + column2, + categorical2, + ) in enumerate(metadata2.categorical_metadata.items()): + simil_matrix = self.__create_dist_matrix( + categorical1.category_embedding, + categorical2.category_embedding, + ) + # count, score = self.__compute_similarity_score(simil_matrix) + dist = self.__compute_distance(simil_matrix) + ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) + result.loc[id1, id2] = dist * ratio + name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2]) + # todo p value or correlation + return concat(result, name_distance) + +class CategoricalHandlerSimilar(CategoricalHandler): 
+ """
+ Handler for column category
+ """
+
+ def __create_sim_matrix(self, embeddings1: list[Tensor], embeddings2: list[Tensor]) -> list[list[float]]:
+ simil_matrix = []
+ for embed1 in embeddings1:
+ siml_line = []
+ for embed2 in embeddings2:
+ siml_line.append(
+ round(
+ cosine_sim(embed1, embed2),
+ 3,
+ )
+ )
+ simil_matrix.append(siml_line)
+ return simil_matrix
+
+ def __compute_similarity_score(self, similarity_matrix: list[list[float]]) -> tuple[int, float]: # todo test some other methods
+ # TODO: use Hausdorff distance?
+ res = 0.0
+ count = 0
+ trashold = 0.7 # todo set from outside
+ for i in similarity_matrix:
+ if max(i) > trashold:
+ count += 1
+ res += max(i)
+ return count, res / len(similarity_matrix) * (count / len(similarity_matrix))
+
+ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame:
+ """
+ Compare categorical columns, if the columns are similar
+ :param metadata1: first table
+ :param metadata2: second table
+ :return: dataframe full of numbers between 0 and 1
+ """
+ result = pd.DataFrame()
+ name_distance = pd.DataFrame()
+ for id1, (column1, categorical1) in enumerate(metadata1.categorical_metadata.items()):
+ for id2, (column2, categorical2) in enumerate(metadata2.categorical_metadata.items()):
+ simil_matrix = self.__create_sim_matrix(categorical1.category_embedding, categorical2.category_embedding)
+ _, score = self.__compute_similarity_score(simil_matrix)
+ ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) # todo 1-ratio???
+ result.loc[id1, id2] = 1 - (score * ratio) + name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2]) + # todo p value or correlation + return concat(result, name_distance) + + +class KindHandlerOldByType(HandlerType): + """ + Handler for column kind + """ + + def __init__( + self, distance_function: DistanceFunction = HausdorffDistanceMin(), compare_kind: list[DataKind] = None, weight: dict[DataKind.BOOL, int] = None + ): + super().__init__(weight=1) + self.distance_function = distance_function + if compare_kind is None: + self.compare_kind = [ + DataKind.BOOL, + DataKind.ID, + DataKind.CATEGORICAL, + DataKind.CONSTANT, + ] + else: + self.compare_kind = compare_kind + if weight is None: + self.kind_weight: dict = {DataKind.BOOL: 1, DataKind.ID: 1, DataKind.CATEGORICAL: 1, DataKind.CONSTANT: 1} + else: + self.kind_weight = weight + + def compute_result(self, distance_table: pd.DataFrame, distance_function: DistanceFunction, settings: set[Settings], weight: int): + """ + Compute result from distance table + """ + tmp = pow(distance_function.compute(distance_table), 2) * weight + if Settings.NO_RATIO not in settings: + tmp = tmp * get_ratio( + distance_table.shape[0], + distance_table.shape[1], + ) + return tmp + + def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: + """ + Creates table of distances between embeddings for each row and computes mean + of row and column minimums then pick max. 
+ :param embeddings1: values for column1 + :param embeddings2: values for column2 + :return: float from 0 to 1 + """ + res = [] + row_mins = [] + id1 = 0 + for embed1 in embeddings1: + results = [] + for embed2 in embeddings2: + result = 1 - cosine_sim(embed1, embed2) + results.append(result) + res.append(results) + row_mins.append(min(results)) + id1 += 1 + column_mins = [] + for_iter = pd.DataFrame(data=res) + for _, column in for_iter.items(): + column_mins.append(min(column)) + return max([mean(column_mins), mean(row_mins)]) # todo vysvetlit v textu + + def __are_columns_null(self, column1: set, column2: set, message: str) -> tuple[bool, pd.DataFrame]: + """ + Check if columns are empty + :param column1: + :param column2: + :param message: + :return: tuple of bool and dataframe, if columns are empty return True + """ + if len(column1) == 0 and len(column2) == 0: + logger.warning(f"{message} is not present in the dataframe.") + return True, pd.DataFrame([0]) + if (len(column1) == 0) != (len(column2) == 0): + logger.warning(f"{message} is not present in one of the dataframes.") + return True, pd.DataFrame([1]) + return False, pd.DataFrame() + + def compare_constants(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: + """ + Compare all constant columns. Compare if they contain nulls. + Compare embeddings of values. + Make an average of these values. 
+ :param metadata1: for column1 + :param metadata2: for column2 + :return: matrix containing float numbers in range <0, 1> + """ + value_re = pd.DataFrame() + nulls_re = pd.DataFrame() + are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.CONSTANT], metadata2.column_kind[DataKind.CONSTANT], "Constant metadata") + if are_nulls[0]: + return are_nulls[1] + for column1 in metadata1.column_kind[DataKind.CONSTANT]: + for column2 in metadata2.column_kind[DataKind.CONSTANT]: + # Extract metadata for columns + meta1 = metadata1.kind_metadata[column1] + meta2 = metadata2.kind_metadata[column2] + + if meta1.value_embeddings is None or meta2.value_embeddings is None: + # 0 distance if values are the same otherwise 1 + value_re.loc[column1, column2] = int(meta1.value != meta2.value) + else: + value_re.loc[column1, column2] = 1 - cosine_sim( + meta1.value_embeddings[0], # todo 0 nebo 1 + meta2.value_embeddings[0], + ) + + # 0 distance if values are the same otherwise 1 + nulls_re.loc[column1, column2] = int(meta1.nulls != meta2.nulls) + + # if nulls are equal and exist + if nulls_re.loc[column1, column2] == 0 and meta1.nulls: + ratio1 = meta1.distribution[0] / meta1.distribution[1] + ratio2 = meta2.distribution[0] / meta2.distribution[1] + nulls_re.loc[column1, column2] = abs(ratio1 - ratio2) # compute difference between distribution + + return concat(nulls_re, value_re) + + def compare_ids(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: + """ + Compare all id columns. Compare if they contain nulls. + Compare embeddings of values. + Compare ratio of max length. + Make an average of these values. 
+ :return: matrix containing float numbers in range <0, 1> + """ + nulls_re = pd.DataFrame() + value_long_re = pd.DataFrame() + value_short_re = pd.DataFrame() + ratio_max_re = pd.DataFrame() + are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.ID], metadata2.column_kind[DataKind.ID], "ID metadata") + if are_nulls[0]: + return are_nulls[1] + for column1 in metadata1.column_kind[DataKind.ID]: + for column2 in metadata2.column_kind[DataKind.ID]: + for value_re, attribute in [(value_long_re, "longest"), (value_short_re, "shortest")]: + embeddings1 = getattr(metadata1.kind_metadata[column1], f"{attribute}_embeddings") + embeddings2 = getattr(metadata2.kind_metadata[column2], f"{attribute}_embeddings") + attribute1 = getattr(metadata1.kind_metadata[column1], attribute) + attribute2 = getattr(metadata2.kind_metadata[column2], attribute) + + if embeddings1 is None or embeddings2 is None: + value_re.loc[column1, column2] = 0 if attribute1 == attribute2 else 1 + else: + value_re.loc[column1, column2] = 1 - cosine_sim( + embeddings1, + embeddings2, + ) + nulls_re.loc[column1, column2] = 0 if metadata1.kind_metadata[column1].nulls == metadata2.kind_metadata[column2].nulls else 1 + ratio_max_re.loc[column1, column2] = abs(metadata1.kind_metadata[column1].ratio_max_length - metadata2.kind_metadata[column2].ratio_max_length) + + return concat( + value_short_re, + value_long_re, + ratio_max_re, + nulls_re, + ) + + def compare_bools(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: + """ + Compare all boolean columns. Compare if they have the same distribution of True and False values. + Compare if they contain nulls. + Compare embeddings of values. + Make an average of these values. 
+ :param metadata1: for column1 + :param metadata2: for column2 + :return: matrix containing float numbers in range <0, 1> + """ + value_re = pd.DataFrame() + distr_re = pd.DataFrame() + nulls_re = pd.DataFrame() + are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.BOOL], metadata2.column_kind[DataKind.BOOL], "Boolean metadata") + if are_nulls[0]: + return are_nulls[1] + for column1 in metadata1.column_kind[DataKind.BOOL]: + for column2 in metadata2.column_kind[DataKind.BOOL]: + nulls_re.loc[column1, column2] = 0 if metadata1.kind_metadata[column1].nulls == metadata2.kind_metadata[column2].nulls else 1 + distr_re.loc[column1, column2] = abs( + metadata1.kind_metadata[column1].distribution[0] / metadata1.kind_metadata[column1].distribution[1] + - metadata2.kind_metadata[column2].distribution[0] / metadata2.kind_metadata[column2].distribution[1] + ) + if metadata1.kind_metadata[column1].value_embeddings is None or metadata2.kind_metadata[column2].value_embeddings is None: + value_re.loc[column1, column2] = 0 + else: + value_re.loc[column1, column2] = self.compute_embeddings_distance( + metadata1.kind_metadata[column1].value_embeddings, metadata2.kind_metadata[column2].value_embeddings + ) + return concat(value_re, distr_re, nulls_re) + + def compare_categorical(self, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: + """ + Compare all categorical columns. Compare if they contain nulls. + Compare embeddings of values. + Make an average of these values. 
+ :param metadata1: for column1 + :param metadata2: for column2 + :return: matrix containing float numbers in range <0, 1> + """ + value_re = pd.DataFrame() + count_re = pd.DataFrame() + are_nulls = self.__are_columns_null(metadata1.column_kind[DataKind.CATEGORICAL], metadata2.column_kind[DataKind.CATEGORICAL], "Categorical metadata") + if are_nulls[0]: + return are_nulls[1] + for column1 in metadata1.column_kind[DataKind.CATEGORICAL]: + for column2 in metadata2.column_kind[DataKind.CATEGORICAL]: + value_re.loc[column1, column2] = self.compute_embeddings_distance( + metadata1.categorical_metadata[column1].category_embedding, metadata2.categorical_metadata[column2].category_embedding + ) + count1 = metadata1.categorical_metadata[column1].count_categories + count2 = metadata2.categorical_metadata[column2].count_categories + count_re.loc[column1, column2] = count1 / count2 if count1 < count2 else count2 / count1 + # todo compare categories_with_count for metadata1 and metadata2 + # firstly normalize dictionary categories_with_count then + # compare the difference between the two dictionaries + return concat(value_re, count_re) + + def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame: + """ + Compare kind columns + :param metadata1: first table + :param metadata2: second table + :return: dataframe full of numbers between 0 and 1 + """ + result = 0 + if DataKind.BOOL in self.compare_kind: + bools = self.compare_bools(metadata1, metadata2) + result += self.compute_result( + bools, + self.distance_function, + self.settings, + self.kind_weight[DataKind.BOOL], + ) + if DataKind.CONSTANT in self.compare_kind: + constants = self.compare_constants(metadata1, metadata2) + result += self.compute_result( + constants, + self.distance_function, + self.settings, + self.kind_weight[DataKind.CONSTANT], + ) + if DataKind.ID in self.compare_kind: + ids = self.compare_ids(metadata1, metadata2) + result += self.compute_result( + ids, + 
self.distance_function, + self.settings, + self.kind_weight[DataKind.ID], + ) + if DataKind.CATEGORICAL in self.compare_kind: + categorical = self.compare_categorical(metadata1, metadata2) + result += self.compute_result( + categorical, + self.distance_function, + self.settings, + self.kind_weight[DataKind.CATEGORICAL], + ) + return pd.DataFrame([result]) \ No newline at end of file diff --git a/similarity_framework/src/interfaces/comparator/comparator.py b/similarity_framework/src/interfaces/comparator/comparator.py index fe4e1fa..590bf1e 100644 --- a/similarity_framework/src/interfaces/comparator/comparator.py +++ b/similarity_framework/src/interfaces/comparator/comparator.py @@ -1,30 +1,12 @@ from abc import abstractmethod, ABC -import pandas as pd - from similarity_framework.src.interfaces.common import DistanceFunction from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin +from similarity_framework.src.interfaces.comparator.handler import HandlerType from similarity_framework.src.models.metadata import Metadata from similarity_framework.src.models.similarity import Settings, SimilarityOutput from similarity_framework.src.models.settings import AnalysisSettings - -class HandlerType(ABC): - """Abstract class for comparators""" - - def __init__(self, weight: int = 1, analysis_settings: AnalysisSettings = None): - """ - Constructor for ComparatorType - :param weight: weight of the comparator - """ - self.weight: int = weight - self.analysis_settings: AnalysisSettings = analysis_settings - - @abstractmethod - def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame | float: - """This method should compare two tables and return distance table""" - - class Comparator(ABC): """ Abstract Comparator class diff --git a/similarity_framework/src/interfaces/comparator/handler.py b/similarity_framework/src/interfaces/comparator/handler.py new file mode 100644 index 0000000..742affc --- /dev/null +++ 
b/similarity_framework/src/interfaces/comparator/handler.py @@ -0,0 +1,23 @@ +from abc import ABC, abstractmethod + +import pandas as pd + +from similarity_framework.src.models.metadata import Metadata +from similarity_framework.src.models.settings import AnalysisSettings + + +class HandlerType(ABC): + """Abstract class for comparators""" + + def __init__(self, weight: int = 1, analysis_settings: AnalysisSettings = None): + """ + Constructor for ComparatorType + :param weight: weight of the comparator + """ + self.weight: int = weight + self.analysis_settings: AnalysisSettings = analysis_settings + + @abstractmethod + def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame | float: + """This method should compare two tables and return distance table""" + From 23babc3d8fddbca993fc4d6f43ad2ed59617c61a Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Mon, 9 Dec 2024 12:43:32 +0100 Subject: [PATCH 04/11] Change sentance transformer model --- .../src/impl/metadata/type_metadata_creator.py | 4 ++-- tests/column2vec/test_column2vec.py | 2 +- tests/column2vec/test_column2vec_cache.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/similarity_framework/src/impl/metadata/type_metadata_creator.py b/similarity_framework/src/impl/metadata/type_metadata_creator.py index 73de7a8..92d4ec5 100644 --- a/similarity_framework/src/impl/metadata/type_metadata_creator.py +++ b/similarity_framework/src/impl/metadata/type_metadata_creator.py @@ -93,7 +93,7 @@ def __init__(self): True for incomplete data and False otherwise """ super().__init__() - self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={"clean_up_tokenization_spaces": True}) + self.model: Optional[SentenceTransformer] = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', tokenizer_kwargs={"clean_up_tokenization_spaces": True}) def __normalize(self, num1: int, num2: int) -> tuple[int, int]: """ @@ -165,7 
+165,7 @@ def get_model(self) -> SentenceTransformer: :return: embedding model if exists or creates new one """ if not self.model: - self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={"clean_up_tokenization_spaces": True}) + self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', tokenizer_kwargs={"clean_up_tokenization_spaces": True}) return self.model # Setting Creator diff --git a/tests/column2vec/test_column2vec.py b/tests/column2vec/test_column2vec.py index e55b59f..fbac164 100644 --- a/tests/column2vec/test_column2vec.py +++ b/tests/column2vec/test_column2vec.py @@ -17,7 +17,7 @@ SKIP_SIMILAR = False # alternative model # MODEL = 'all-mpnet-base-v2' # bert-base-nli-mean-tokens -MODEL = 'bert-base-nli-mean-tokens' # +MODEL = 'sentence-transformers/all-mpnet-base-v2' # THIS_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/tests/column2vec/test_column2vec_cache.py b/tests/column2vec/test_column2vec_cache.py index 4fe04bd..287c881 100644 --- a/tests/column2vec/test_column2vec_cache.py +++ b/tests/column2vec/test_column2vec_cache.py @@ -8,7 +8,7 @@ column2vec_weighted_avg, column2vec_sum, column2vec_weighted_sum from column2vec.src.functions import get_nonnumerical_data -MODEL = 'bert-base-nli-mean-tokens' +MODEL = 'sentence-transformers/all-mpnet-base-v2' THIS_DIR = os.path.dirname(os.path.abspath(__file__)) From eb6d191f0b3e6f718ea360ffbbf4ddd63671a01b Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Mon, 9 Dec 2024 12:43:44 +0100 Subject: [PATCH 05/11] Fix tests --- similarity_framework/main.py | 8 ++-- tests/runner/test_runner_connectors.py | 28 ++++++------ tests/runner/test_runner_formators.py | 28 ------------ tests/runner/test_runner_functions.py | 44 +++++++++---------- .../test_similarity_comparator.py | 13 +++--- 5 files changed, 45 insertions(+), 76 deletions(-) delete mode 100644 tests/runner/test_runner_formators.py diff --git a/similarity_framework/main.py 
b/similarity_framework/main.py index 9bed26a..dc23a7f 100644 --- a/similarity_framework/main.py +++ b/similarity_framework/main.py @@ -22,10 +22,12 @@ def create_metadata(data): - return (TypeMetadataCreator(data).compute_advanced_structural_types().compute_column_kind().compute_column_names_embeddings()).get_metadata() + return (TypeMetadataCreator(). + compute_advanced_structural_types() + .compute_column_kind().compute_column_names_embeddings()).get_metadata(data) -def compare_datasets(path1, path2): +def compare_datasets(path1:str, path2): """ This function compare two tables It will read datasets, create metadata and comparator, compare them @@ -41,7 +43,7 @@ def compare_datasets(path1, path2): ComparatorByColumn() ## different option # .add_comparator_type(SizeComparatorByColumn()) - .add_comparator_type(IncompleteColumnsComparatorByColumn()).add_comparator_type(ColumnNamesEmbeddingsComparatorByColumn()) + .add_comparator_type(IncompleteColumnsComparatorByColumn()).add_comparator_type(ColumnNamesEmbeddingsHandler()) ## different option # .add_comparator_type(ColumnKindHandler()) ) diff --git a/tests/runner/test_runner_connectors.py b/tests/runner/test_runner_connectors.py index f61de40..18547e9 100644 --- a/tests/runner/test_runner_connectors.py +++ b/tests/runner/test_runner_connectors.py @@ -1,27 +1,25 @@ import unittest -from src.connectors.filesystem_connector import FilesystemConnector -from src.models import FSConnectorSettings +from similarity_runner.src.impl.filesystem_connector import FilesystemConnector, FSConnectorSettings class TestFileSystemConnector(unittest.TestCase): def test_get_data_files(self): connector = FilesystemConnector() - settings = FSConnectorSettings(files_paths=["./data/netflix_titles.csv", - "./data/disney_movies.csv"], - directory_paths=[], - file_type=("csv",)) - data, names = connector.get_data(settings) + settings = FSConnectorSettings(files_paths="../data/netflix_titles.csv,../data/disney_movies.csv", + directory_paths="", 
+ filetypes="csv") + res = connector.get_data(settings) connector.close() - self.assertEqual(len(data), 2) - self.assertEqual(names[0], "./data/netflix_titles") - self.assertEqual(names[1], "./data/disney_movies") + self.assertEqual(len(res), 2) + self.assertEqual(res[0].source_name, "../data/netflix_titles") + self.assertEqual(res[1].source_name, "../data/disney_movies") def test_get_data_folder(self): connector = FilesystemConnector() - settings = FSConnectorSettings(files_paths=[], - directory_paths=["./data"], - file_type=("csv",)) - data, _ = connector.get_data(settings) + settings = FSConnectorSettings(files_paths="", + directory_paths="../data", + filetypes="csv") + data = connector.get_data(settings) connector.close() - self.assertEqual(len(data), 11) + self.assertEqual(len(data), 13) diff --git a/tests/runner/test_runner_formators.py b/tests/runner/test_runner_formators.py deleted file mode 100644 index 8db65d8..0000000 --- a/tests/runner/test_runner_formators.py +++ /dev/null @@ -1,28 +0,0 @@ -import unittest - -from src.formators import JsonFormater - - -class TestJsonFormater(unittest.TestCase): - def test_format_data(self): - data = {"a": {"b": 0.5, "c": 0.3}, - "b": {"a": 0.5, "c": 0.8}, - "c": {"a": 0.3, "b": 0.8}} - formater = JsonFormater() - jsondata = formater.format(data) - self.assertEqual(jsondata, '''{ - "a": { - "b": 0.5, - "c": 0.3 - }, - "b": { - "a": 0.5, - "c": 0.8 - }, - "c": { - "a": 0.3, - "b": 0.8 - } -}''') - - diff --git a/tests/runner/test_runner_functions.py b/tests/runner/test_runner_functions.py index fb62df2..4d529e6 100644 --- a/tests/runner/test_runner_functions.py +++ b/tests/runner/test_runner_functions.py @@ -17,35 +17,35 @@ def csv_to_parquet(file: str, sep: str = ',') -> str: class TestLoadFilesFromList(unittest.TestCase): def test_load_csv_file(self): - data, names = load_files_from_list(["./data/netflix_titles.csv"], (FileType.CSV, )) - self.assertEqual(len(data), 1) - self.assertEqual(names[0], 
"./data/netflix_titles") + res = load_files_from_list(["../data/netflix_titles.csv"], (FileType.CSV, )) + self.assertEqual(len(res), 1) + self.assertEqual(res[0].source_name, "../data/netflix_titles") def test_load_csv_files(self): - data, names = load_files_from_list(["./data/netflix_titles.csv", "./data/disney_movies.csv"], (FileType.CSV, )) - self.assertEqual(len(data), 2) - self.assertEqual(names[0], "./data/netflix_titles") - self.assertEqual(names[1], "./data/disney_movies") + res = load_files_from_list(["../data/netflix_titles.csv", "../data/disney_movies.csv"], (FileType.CSV, )) + self.assertEqual(len(res), 2) + self.assertEqual(res[0].source_name, "../data/netflix_titles") + self.assertEqual(res[1].source_name, "../data/disney_movies") def test_load_parquet_file(self): - csv_to_parquet("./data/netflix_titles.csv") - data, names = load_files_from_list(["./data/netflix_titles.parquet"], (FileType.PARQUET, )) - self.assertEqual(len(data), 1) - self.assertEqual(names[0], "./data/netflix_titles") + csv_to_parquet("../data/netflix_titles.csv") + res = load_files_from_list(["../data/netflix_titles.parquet"], (FileType.PARQUET, )) + self.assertEqual(len(res), 1) + self.assertEqual(res[0].source_name, "../data/netflix_titles") def test_load_parquet_files(self): - csv_to_parquet("./data/netflix_titles.csv") - csv_to_parquet("./data/disney_movies.csv") - data, names = load_files_from_list(["./data/netflix_titles.parquet", "./data/disney_movies.parquet"], (FileType.PARQUET, )) - self.assertEqual(len(data), 2) - self.assertEqual(names[0], "./data/netflix_titles") - self.assertEqual(names[1], "./data/disney_movies") + csv_to_parquet("../data/netflix_titles.csv") + csv_to_parquet("../data/disney_movies.csv") + res = load_files_from_list(["../data/netflix_titles.parquet", "../data/disney_movies.parquet"], (FileType.PARQUET, )) + self.assertEqual(len(res), 2) + self.assertEqual(res[0].source_name, "../data/netflix_titles") + self.assertEqual(res[1].source_name, 
"../data/disney_movies") def test_load_csv_and_parquet_files(self): - csv_to_parquet("./data/netflix_titles.csv") - data, names = load_files_from_list(["./data/netflix_titles.parquet", "./data/disney_movies.csv"], (FileType.PARQUET, FileType.CSV)) - self.assertEqual(len(data), 2) - self.assertEqual(names[0], "./data/netflix_titles") - self.assertEqual(names[1], "./data/disney_movies") + csv_to_parquet("../data/netflix_titles.csv") + res = load_files_from_list(["../data/netflix_titles.parquet", "../data/disney_movies.csv"], (FileType.PARQUET, FileType.CSV)) + self.assertEqual(len(res), 2) + self.assertEqual(res[0].source_name, "../data/netflix_titles") + self.assertEqual(res[1].source_name, "../data/disney_movies") diff --git a/tests/similarity_framework/test_similarity_comparator.py b/tests/similarity_framework/test_similarity_comparator.py index c56c296..32e01f6 100644 --- a/tests/similarity_framework/test_similarity_comparator.py +++ b/tests/similarity_framework/test_similarity_comparator.py @@ -3,8 +3,9 @@ import pandas as pd -from similarity_framework.src.impl.comparator.comparator_by_type import HausdorffDistanceMin, SizeHandler, get_ratio, ComparatorByType, \ - ColumnExactNamesHandler, ColumnNamesEmbeddingsHandler, IncompleteColumnsHandler, KindHandler +from similarity_framework.src.impl.comparator.comparator_by_type import ComparatorByType +from similarity_framework.src.impl.comparator.handlers import HausdorffDistanceMin, SizeHandler, get_ratio, \ + ColumnExactNamesHandler, ColumnNamesEmbeddingsHandler, IncompleteColumnsHandler from similarity_framework.src.impl.comparator.comparator_by_column import (ComparatorByColumn, SizeHandler as SizeHandlerByColumn, IncompleteColumnsHandler as IncompleteColumnsHandlerByColumn, ColumnNamesEmbeddingsHandler as ColumnNamesEmbeddingsHandlerByColumn, @@ -134,12 +135,8 @@ def setUp(self): self.data_second_half.index = self.data_second_half.index - int(len(self.data) / 2) self.data_diff_type = self.data.copy() # todo fill - 
self.metadata_creator = (TypeMetadataCreator() - .compute_advanced_structural_types() - .compute_column_kind() - .compute_advanced_structural_types() - .compute_incomplete_column() - .compute_column_names_embeddings()) + self.metadata_creator = TypeMetadataCreator() + self.metadata_creator.compute_advanced_structural_types().compute_column_kind().compute_incomplete_column().compute_column_names_embeddings() self.metadata1 = self.metadata_creator.get_metadata(MetadataCreatorInput(dataframe=self.data)) self.metadata_diff_column_names = self.metadata_creator.get_metadata(MetadataCreatorInput(dataframe=self.data_diff_column_names)) self.metadata_first_half = self.metadata_creator.get_metadata(MetadataCreatorInput(dataframe=self.data_first_half)) From 6f8f0a4e998257a121591c1b56abaf7beeb23264 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Mon, 9 Dec 2024 12:45:19 +0100 Subject: [PATCH 06/11] Add runner tests to pipeline --- .github/workflows/py_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/py_test.yml b/.github/workflows/py_test.yml index 1fa335f..7ce5c44 100644 --- a/.github/workflows/py_test.yml +++ b/.github/workflows/py_test.yml @@ -51,7 +51,7 @@ jobs: python-tests: env: - TEST_FILES: tests/similarity_framework/test_similarity* tests/column2vec/test_column2vec_cache.py + TEST_FILES: tests/similarity_framework/test_similarity* tests/column2vec/test_column2vec_cache.py tests/runner/test_runner* name: Run Python Tests runs-on: ubuntu-latest steps: From 7f67d8bed7ef6cb2d53dc9747854f1c63ff435b4 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Mon, 9 Dec 2024 14:17:47 +0100 Subject: [PATCH 07/11] Fix tests --- tests/runner/test_runner_connectors.py | 14 ++++++--- tests/runner/test_runner_functions.py | 43 ++++++++++++++++---------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/tests/runner/test_runner_connectors.py b/tests/runner/test_runner_connectors.py index 18547e9..440a8e6 100644 --- 
a/tests/runner/test_runner_connectors.py +++ b/tests/runner/test_runner_connectors.py @@ -1,24 +1,30 @@ +import os import unittest from similarity_runner.src.impl.filesystem_connector import FilesystemConnector, FSConnectorSettings +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) class TestFileSystemConnector(unittest.TestCase): + def setUp(self): + self.file1 = os.path.join(THIS_DIR, '../data/netflix_titles.csv') + self.file2 = os.path.join(THIS_DIR, '../data/disney_movies.csv') + self.dir = os.path.join(THIS_DIR, '../data') def test_get_data_files(self): connector = FilesystemConnector() - settings = FSConnectorSettings(files_paths="../data/netflix_titles.csv,../data/disney_movies.csv", + settings = FSConnectorSettings(files_paths=self.file1 + "," + self.file2, directory_paths="", filetypes="csv") res = connector.get_data(settings) connector.close() self.assertEqual(len(res), 2) - self.assertEqual(res[0].source_name, "../data/netflix_titles") - self.assertEqual(res[1].source_name, "../data/disney_movies") + self.assertEqual(res[0].source_name, self.file1.replace(".csv", "")) + self.assertEqual(res[1].source_name, self.file2.replace(".csv", "")) def test_get_data_folder(self): connector = FilesystemConnector() settings = FSConnectorSettings(files_paths="", - directory_paths="../data", + directory_paths=self.dir, filetypes="csv") data = connector.get_data(settings) connector.close() diff --git a/tests/runner/test_runner_functions.py b/tests/runner/test_runner_functions.py index 4d529e6..d081119 100644 --- a/tests/runner/test_runner_functions.py +++ b/tests/runner/test_runner_functions.py @@ -1,3 +1,4 @@ +import os import unittest import pandas as pd @@ -5,6 +6,7 @@ from similarity_runner.src.impl.filesystem_connector import load_files_from_list from similarity_runner.src.models.connectors import FileType +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) def csv_to_parquet(file: str, sep: str = ',') -> str: """ @@ -16,36 +18,43 @@ def csv_to_parquet(file: 
str, sep: str = ',') -> str: return file.replace(".csv", ".parquet") class TestLoadFilesFromList(unittest.TestCase): + def setUp(self): + self.netflix_file = os.path.join(THIS_DIR, '../data/netflix_titles.csv') + self.netflix_file_parquet = os.path.join(THIS_DIR, '../data/netflix_titles.parquet') + self.disney_file = os.path.join(THIS_DIR, '../data/disney_movies.csv') + self.disney_file_parquet = os.path.join(THIS_DIR, '../data/disney_movies.parquet') + self.dir = os.path.join(THIS_DIR, '../data') + def test_load_csv_file(self): - res = load_files_from_list(["../data/netflix_titles.csv"], (FileType.CSV, )) + res = load_files_from_list([self.netflix_file], (FileType.CSV,)) self.assertEqual(len(res), 1) - self.assertEqual(res[0].source_name, "../data/netflix_titles") + self.assertEqual(res[0].source_name, self.netflix_file.replace(".csv", "")) def test_load_csv_files(self): - res = load_files_from_list(["../data/netflix_titles.csv", "../data/disney_movies.csv"], (FileType.CSV, )) + res = load_files_from_list([self.netflix_file, self.disney_file], (FileType.CSV,)) self.assertEqual(len(res), 2) - self.assertEqual(res[0].source_name, "../data/netflix_titles") - self.assertEqual(res[1].source_name, "../data/disney_movies") + self.assertEqual(res[0].source_name, self.netflix_file.replace(".csv", "")) + self.assertEqual(res[1].source_name, self.disney_file.replace(".csv", "")) def test_load_parquet_file(self): - csv_to_parquet("../data/netflix_titles.csv") - res = load_files_from_list(["../data/netflix_titles.parquet"], (FileType.PARQUET, )) + csv_to_parquet(self.netflix_file) + res = load_files_from_list([self.netflix_file_parquet], (FileType.PARQUET, )) self.assertEqual(len(res), 1) - self.assertEqual(res[0].source_name, "../data/netflix_titles") + self.assertEqual(res[0].source_name, self.netflix_file_parquet.replace(".parquet", "")) def test_load_parquet_files(self): - csv_to_parquet("../data/netflix_titles.csv") - csv_to_parquet("../data/disney_movies.csv") - res = 
load_files_from_list(["../data/netflix_titles.parquet", "../data/disney_movies.parquet"], (FileType.PARQUET, )) + csv_to_parquet(self.netflix_file) + csv_to_parquet(self.disney_file) + res = load_files_from_list([self.netflix_file_parquet, self.disney_file_parquet], (FileType.PARQUET, )) self.assertEqual(len(res), 2) - self.assertEqual(res[0].source_name, "../data/netflix_titles") - self.assertEqual(res[1].source_name, "../data/disney_movies") + self.assertEqual(res[0].source_name, self.netflix_file_parquet.replace(".parquet", "")) + self.assertEqual(res[1].source_name, self.disney_file_parquet.replace(".parquet", "")) def test_load_csv_and_parquet_files(self): - csv_to_parquet("../data/netflix_titles.csv") - res = load_files_from_list(["../data/netflix_titles.parquet", "../data/disney_movies.csv"], (FileType.PARQUET, FileType.CSV)) + csv_to_parquet(self.netflix_file) + res = load_files_from_list([self.netflix_file_parquet, self.disney_file], (FileType.PARQUET, FileType.CSV)) self.assertEqual(len(res), 2) - self.assertEqual(res[0].source_name, "../data/netflix_titles") - self.assertEqual(res[1].source_name, "../data/disney_movies") + self.assertEqual(res[0].source_name, self.netflix_file_parquet.replace(".parquet", "")) + self.assertEqual(res[1].source_name, self.disney_file.replace(".csv", "")) From 03ce876add589136d46015ff8efcb5435bf5f570 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Mon, 9 Dec 2024 15:22:15 +0100 Subject: [PATCH 08/11] Fix tests --- .../src/impl/comparator/handlers.py | 14 ++++------- .../test_similarity_comparator.py | 24 ++++++++++++++++++- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/similarity_framework/src/impl/comparator/handlers.py b/similarity_framework/src/impl/comparator/handlers.py index 7b54dd8..3286d43 100644 --- a/similarity_framework/src/impl/comparator/handlers.py +++ b/similarity_framework/src/impl/comparator/handlers.py @@ -179,7 +179,7 @@ def __init__(self, compare_kind=None, weight=1): else: 
self.kind_weight = weight - def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # todo add type + def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: """ Creates table of distances between embeddings for each row and computes mean of row and column minimums then pick max. @@ -463,7 +463,6 @@ def __create_dist_matrix(self, embeddings1: list[Tensor], embeddings2: list[Tens for embed1 in embeddings1: siml_line = [] for embed2 in embeddings2: - # todo rounding for 3 digits ? ok -> two because of minus 0 siml_line.append( round( 1 @@ -504,10 +503,9 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) result.loc[id1, id2] = dist * ratio name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2]) - # todo p value or correlation return concat(result, name_distance) -class CategoricalHandlerSimilar(CategoricalHandler): +class CategoricalHandlerSimilar(CategoricalHandler):# pragma: no cover """ Handler for column category """ @@ -550,14 +548,13 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data for id2, (column2, categorical2) in enumerate(metadata2.categorical_metadata.items()): simil_matrix = self.__create_sim_matrix(categorical1.category_embedding, categorical2.category_embedding) _, score = self.__compute_similarity_score(simil_matrix) - ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) # todo 1-ratio??? 
+ ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) result.loc[id1, id2] = 1 - (score * ratio) name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2]) - # todo p value or correlation return concat(result, name_distance) -class KindHandlerOldByType(HandlerType): +class KindHandlerOldByType(HandlerType):# pragma: no cover """ Handler for column kind """ @@ -659,7 +656,7 @@ def compare_constants(self, metadata1: Metadata, metadata2: Metadata) -> pd.Data value_re.loc[column1, column2] = int(meta1.value != meta2.value) else: value_re.loc[column1, column2] = 1 - cosine_sim( - meta1.value_embeddings[0], # todo 0 nebo 1 + meta1.value_embeddings[0], meta2.value_embeddings[0], ) @@ -767,7 +764,6 @@ def compare_categorical(self, metadata1: Metadata, metadata2: Metadata) -> pd.Da count1 = metadata1.categorical_metadata[column1].count_categories count2 = metadata2.categorical_metadata[column2].count_categories count_re.loc[column1, column2] = count1 / count2 if count1 < count2 else count2 / count1 - # todo compare categories_with_count for metadata1 and metadata2 # firstly normalize dictionary categories_with_count then # compare the difference between the two dictionaries return concat(value_re, count_re) diff --git a/tests/similarity_framework/test_similarity_comparator.py b/tests/similarity_framework/test_similarity_comparator.py index 32e01f6..f29dc9a 100644 --- a/tests/similarity_framework/test_similarity_comparator.py +++ b/tests/similarity_framework/test_similarity_comparator.py @@ -2,6 +2,8 @@ import unittest import pandas as pd +from pyarrow import Tensor +from sentence_transformers import SentenceTransformer from similarity_framework.src.impl.comparator.comparator_by_type import ComparatorByType from similarity_framework.src.impl.comparator.handlers import HausdorffDistanceMin, SizeHandler, get_ratio, \ @@ -14,7 +16,7 @@ ) from 
similarity_framework.src.impl.comparator.distance_functions import AverageDist from similarity_framework.src.impl.comparator.utils import concat, cosine_sim, fill_result, are_columns_null, create_string_from_columns -from similarity_framework.src.models.metadata import MetadataCreatorInput +from similarity_framework.src.models.metadata import MetadataCreatorInput, Metadata, CategoricalMetadata from similarity_framework.src.models.similarity import Settings from similarity_framework.src.impl.metadata.type_metadata_creator import TypeMetadataCreator from similarity_framework.src.models.types_ import DataKind @@ -145,6 +147,8 @@ def setUp(self): self.compartor.types_compare = False self.compartor.kinds_compare = False + self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') + def test_size_compare(self): self.compartor.add_comparator_type(SizeHandler()) @@ -211,6 +215,24 @@ def test_kind_ID_compare(self): self.compartor.compare(self.metadata1, self.metadata1).distance, 0) self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) + def test_kind_CATEGORICAL_compare(self): + self.compartor.set_types(False) + metadata = Metadata() + metadata.column_kind[DataKind.CATEGORICAL] = {'column_0', 'column_1'} + metadata.categorical_metadata = {'column_0': + CategoricalMetadata(3, ["One", "Two", "Three"], + pd.Series({'One': 10, 'Two': 5, 'Three': 8}), + [self.model.encode('One'), self.model.encode('Two'), self.model.encode('Three')] + ), + 'column_1': + CategoricalMetadata(3, ["One", "Two", "Three"], + pd.Series({'One': 15, 'Two': 1, 'Three': 7}), + [self.model.encode('One'), self.model.encode('Two'), self.model.encode('Three')] + ) + } + self.compartor.add_comparator_type(ColumnKindHandler(compare_kind=[DataKind.CATEGORICAL])) + self.assertEqual(self.compartor.compare(metadata, metadata).distance, 0) + def test_kind_CONSTANT_compare(self): self.compartor.set_types(False) From 
537b22daa58d655a35376328fd2221a0169a54aa Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Mon, 9 Dec 2024 15:23:13 +0100 Subject: [PATCH 09/11] Format with black --- similarity_framework/main.py | 6 +- .../impl/comparator/comparator_by_column.py | 13 ++- .../src/impl/comparator/comparator_by_type.py | 83 ++++++++++--------- .../src/impl/comparator/handlers.py | 30 +++---- .../src/impl/comparator/utils.py | 1 - .../impl/metadata/type_metadata_creator.py | 6 +- .../src/interfaces/comparator/comparator.py | 1 + .../src/interfaces/comparator/handler.py | 1 - similarity_runner/src/interfaces/ui.py | 5 -- 9 files changed, 79 insertions(+), 67 deletions(-) diff --git a/similarity_framework/main.py b/similarity_framework/main.py index dc23a7f..cbc71ee 100644 --- a/similarity_framework/main.py +++ b/similarity_framework/main.py @@ -22,12 +22,10 @@ def create_metadata(data): - return (TypeMetadataCreator(). - compute_advanced_structural_types() - .compute_column_kind().compute_column_names_embeddings()).get_metadata(data) + return (TypeMetadataCreator().compute_advanced_structural_types().compute_column_kind().compute_column_names_embeddings()).get_metadata(data) -def compare_datasets(path1:str, path2): +def compare_datasets(path1: str, path2): """ This function compare two tables It will read datasets, create metadata and comparator, compare them diff --git a/similarity_framework/src/impl/comparator/comparator_by_column.py b/similarity_framework/src/impl/comparator/comparator_by_column.py index 36bf2e8..cf44559 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_column.py +++ b/similarity_framework/src/impl/comparator/comparator_by_column.py @@ -3,15 +3,22 @@ from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin, AverageDist -from similarity_framework.src.impl.comparator.handlers import SizeHandler, IncompleteColumnsHandler, ColumnExactNamesHandler, ColumnNamesEmbeddingsHandler, \ - ColumnEmbeddingsHandler, 
ColumnKindHandler, ColumnTypeHandler, TableHandler +from similarity_framework.src.impl.comparator.handlers import ( + SizeHandler, + IncompleteColumnsHandler, + ColumnExactNamesHandler, + ColumnNamesEmbeddingsHandler, + ColumnEmbeddingsHandler, + ColumnKindHandler, + ColumnTypeHandler, + TableHandler, +) from similarity_framework.src.interfaces.comparator.comparator import HandlerType, Comparator from similarity_framework.src.models.metadata import Metadata from similarity_framework.src.models.similarity import SimilarityOutput from similarity_framework.src.models.settings import AnalysisSettings - class ComparatorByColumn(Comparator): """ Comparator for comparing two tables diff --git a/similarity_framework/src/impl/comparator/comparator_by_type.py b/similarity_framework/src/impl/comparator/comparator_by_type.py index af16931..2cfe6a1 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_type.py +++ b/similarity_framework/src/impl/comparator/comparator_by_type.py @@ -1,22 +1,25 @@ from __future__ import annotations import math -from statistics import mean import numpy as np import pandas as pd -from torch import Tensor from logging_ import logger -from similarity_framework.src.impl.comparator.comparator_by_column import ColumnTypeHandler, IncompleteColumnsHandler, ColumnExactNamesHandler, \ - ColumnNamesEmbeddingsHandler, ColumnEmbeddingsHandler, SizeHandler, ColumnKindHandler -from similarity_framework.src.impl.comparator.utils import cosine_sim, get_ratio, concat, fill_result -from similarity_framework.src.interfaces.common import DistanceFunction +from similarity_framework.src.impl.comparator.comparator_by_column import ( + ColumnTypeHandler, + IncompleteColumnsHandler, + ColumnExactNamesHandler, + ColumnNamesEmbeddingsHandler, + ColumnEmbeddingsHandler, + SizeHandler, + ColumnKindHandler, +) +from similarity_framework.src.impl.comparator.utils import get_ratio, concat from similarity_framework.src.impl.comparator.distance_functions import 
HausdorffDistanceMin, AverageDist from similarity_framework.src.interfaces.comparator.comparator import HandlerType, Comparator from similarity_framework.src.models.metadata import Metadata from similarity_framework.src.models.similarity import SimilarityOutput, Settings -from similarity_framework.src.models.types_ import DataKind, Type from similarity_framework.src.models.settings import AnalysisSettings @@ -59,6 +62,7 @@ def __init__(self): self.types_compare = True self.kind_weight = 1 self.type_weight = 1 + def set_kinds(self, value: bool) -> "ComparatorByType": """ Set if kinds should be compared @@ -80,9 +84,9 @@ def add_comparator_type(self, comparator: HandlerType) -> "ComparatorByType": self.comparator_type.append(comparator) return self - def __compare_all_columns(self, metadata1: Metadata, metadata2: Metadata, - column_names1: set[str], column_names2: set[str], - comparators: list[HandlerType]) -> pd.DataFrame: + def __compare_all_columns( + self, metadata1: Metadata, metadata2: Metadata, column_names1: set[str], column_names2: set[str], comparators: list[HandlerType] + ) -> pd.DataFrame: all_compares = [] for comparator in comparators: col_to_col = pd.DataFrame() @@ -91,25 +95,22 @@ def __compare_all_columns(self, metadata1: Metadata, metadata2: Metadata, result = comparator.compare(metadata1, metadata2, index1=name1, index2=name2) if result is not np.nan: col_to_col.loc[idx1, idx2] = result - if not col_to_col.empty: all_compares.append(col_to_col) # todo add , comparator.weight + if not col_to_col.empty: + all_compares.append(col_to_col) # todo add , comparator.weight return pd.DataFrame if all_compares == [] else concat(*all_compares) def __compare_types(self, type_, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: comparators = self.comparator_type.copy() - if self.types_compare: comparators.append(ColumnTypeHandler()) - all_compares = self.__compare_all_columns(metadata1, metadata2, - metadata1.column_type[type_], - 
metadata2.column_type[type_], - comparators) + if self.types_compare: + comparators.append(ColumnTypeHandler()) + all_compares = self.__compare_all_columns(metadata1, metadata2, metadata1.column_type[type_], metadata2.column_type[type_], comparators) return all_compares def __compare_kinds(self, kind, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame: comparators = self.comparator_type.copy() - if self.kinds_compare: comparators.append(ColumnKindHandler()) - all_compares = self.__compare_all_columns(metadata1, metadata2, - metadata1.column_kind[kind], - metadata2.column_kind[kind], - comparators) + if self.kinds_compare: + comparators.append(ColumnKindHandler()) + all_compares = self.__compare_all_columns(metadata1, metadata2, metadata1.column_kind[kind], metadata2.column_kind[kind], comparators) return all_compares def _compare(self, metadata1: Metadata, metadata2: Metadata) -> SimilarityOutput: @@ -123,35 +124,43 @@ def _compare(self, metadata1: Metadata, metadata2: Metadata) -> SimilarityOutput continue dist_table = self.__compare_types(type_, metadata1, metadata2) if not dist_table.empty: - distances.append((self.distance_function.compute(dist_table), - get_ratio( - dist_table.shape[0], - dist_table.shape[1], - ), - self.type_weight)) + distances.append( + ( + self.distance_function.compute(dist_table), + get_ratio( + dist_table.shape[0], + dist_table.shape[1], + ), + self.type_weight, + ) + ) if self.kinds: for kind in metadata1.column_kind.keys(): - if metadata1.column_kind[kind] != () and metadata2.column_kind[kind] != (): + if metadata1.column_kind[kind] != () and metadata2.column_kind[kind] != (): dist_table = self.__compare_kinds(kind, metadata1, metadata2) if not dist_table.empty: - distances.append((self.distance_function.compute(dist_table), - get_ratio( - dist_table.shape[0], - dist_table.shape[1], - ), - self.kind_weight)) + distances.append( + ( + self.distance_function.compute(dist_table), + get_ratio( + dist_table.shape[0], + 
dist_table.shape[1], + ), + self.kind_weight, + ) + ) result = 0 nan = 0 - sum_weight = sum([weight for _,_, weight in distances if not np.isnan(weight)]) + sum_weight = sum([weight for _, _, weight in distances if not np.isnan(weight)]) for dist, ratio, weight in distances: if math.isnan(dist): nan += 1 continue if Settings.NO_RATIO in self.settings: - result += dist * dist * weight/sum_weight + result += dist * dist * weight / sum_weight else: - result += dist * dist * ratio * weight/sum_weight + result += dist * dist * ratio * weight / sum_weight if nan == len(distances): return SimilarityOutput(distance=1) return SimilarityOutput(distance=np.sqrt(result)) diff --git a/similarity_framework/src/impl/comparator/handlers.py b/similarity_framework/src/impl/comparator/handlers.py index 3286d43..da83491 100644 --- a/similarity_framework/src/impl/comparator/handlers.py +++ b/similarity_framework/src/impl/comparator/handlers.py @@ -221,8 +221,16 @@ def compare_bools( :return: float number in range <0, 1> """ nulls = 0 if metadata1.nulls == metadata2.nulls else 1 - dist1 = metadata1.distribution[0] / metadata1.distribution[1] if metadata1.distribution[1] > metadata1.distribution[0] else metadata1.distribution[1] / metadata1.distribution[0] - dist2 = metadata2.distribution[0] / metadata2.distribution[1] if metadata2.distribution[1] > metadata2.distribution[0] else metadata2.distribution[1] / metadata2.distribution[0] + dist1 = ( + metadata1.distribution[0] / metadata1.distribution[1] + if metadata1.distribution[1] > metadata1.distribution[0] + else metadata1.distribution[1] / metadata1.distribution[0] + ) + dist2 = ( + metadata2.distribution[0] / metadata2.distribution[1] + if metadata2.distribution[1] > metadata2.distribution[0] + else metadata2.distribution[1] / metadata2.distribution[0] + ) distr = abs(dist1 - dist2) if metadata1.value_embeddings is None or metadata2.value_embeddings is None: return (nulls + distr) / 2 @@ -374,9 +382,7 @@ def _inner_compare(self, 
metadata1: Metadata, metadata2: Metadata, index1: str, class ColumnTypeHandler(SpecificColumnHandler): - def __numerical_compare1( - self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int - ) -> float: + def __numerical_compare1(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int) -> float: num_met1 = metadata1.numerical_metadata[index1] num_met2 = metadata2.numerical_metadata[index2] if num_met1.same_value_length == num_met2.same_value_length: @@ -393,9 +399,7 @@ def __numerical_compare1( score += 2 return 1 - score / 9 - def __nonnumerical_compare1( - self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int - ) -> float: + def __nonnumerical_compare1(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int) -> float: num_met1 = metadata1.nonnumerical_metadata[index1] num_met2 = metadata2.nonnumerical_metadata[index2] if num_met1.longest == num_met2.longest or num_met1.longest is num_met2.longest: @@ -431,9 +435,6 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str, return 1 - - - class CategoricalHandler(HandlerType): """ Categorical Handler class @@ -505,7 +506,8 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2]) return concat(result, name_distance) -class CategoricalHandlerSimilar(CategoricalHandler):# pragma: no cover + +class CategoricalHandlerSimilar(CategoricalHandler): # pragma: no cover """ Handler for column category """ @@ -554,7 +556,7 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data return concat(result, name_distance) -class KindHandlerOldByType(HandlerType):# pragma: no cover +class KindHandlerOldByType(HandlerType): # pragma: no cover """ Handler for column kind """ @@ -808,4 +810,4 @@ def compare(self, 
metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data self.settings, self.kind_weight[DataKind.CATEGORICAL], ) - return pd.DataFrame([result]) \ No newline at end of file + return pd.DataFrame([result]) diff --git a/similarity_framework/src/impl/comparator/utils.py b/similarity_framework/src/impl/comparator/utils.py index af9d74f..c302395 100644 --- a/similarity_framework/src/impl/comparator/utils.py +++ b/similarity_framework/src/impl/comparator/utils.py @@ -1,4 +1,3 @@ -import logging import os import numpy as np diff --git a/similarity_framework/src/impl/metadata/type_metadata_creator.py b/similarity_framework/src/impl/metadata/type_metadata_creator.py index 92d4ec5..5608842 100644 --- a/similarity_framework/src/impl/metadata/type_metadata_creator.py +++ b/similarity_framework/src/impl/metadata/type_metadata_creator.py @@ -93,7 +93,9 @@ def __init__(self): True for incomplete data and False otherwise """ super().__init__() - self.model: Optional[SentenceTransformer] = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', tokenizer_kwargs={"clean_up_tokenization_spaces": True}) + self.model: Optional[SentenceTransformer] = SentenceTransformer( + "sentence-transformers/all-mpnet-base-v2", tokenizer_kwargs={"clean_up_tokenization_spaces": True} + ) def __normalize(self, num1: int, num2: int) -> tuple[int, int]: """ @@ -165,7 +167,7 @@ def get_model(self) -> SentenceTransformer: :return: embedding model if exists or creates new one """ if not self.model: - self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', tokenizer_kwargs={"clean_up_tokenization_spaces": True}) + self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", tokenizer_kwargs={"clean_up_tokenization_spaces": True}) return self.model # Setting Creator diff --git a/similarity_framework/src/interfaces/comparator/comparator.py b/similarity_framework/src/interfaces/comparator/comparator.py index 590bf1e..9687506 100644 --- 
a/similarity_framework/src/interfaces/comparator/comparator.py +++ b/similarity_framework/src/interfaces/comparator/comparator.py @@ -7,6 +7,7 @@ from similarity_framework.src.models.similarity import Settings, SimilarityOutput from similarity_framework.src.models.settings import AnalysisSettings + class Comparator(ABC): """ Abstract Comparator class diff --git a/similarity_framework/src/interfaces/comparator/handler.py b/similarity_framework/src/interfaces/comparator/handler.py index 742affc..c70dd79 100644 --- a/similarity_framework/src/interfaces/comparator/handler.py +++ b/similarity_framework/src/interfaces/comparator/handler.py @@ -20,4 +20,3 @@ def __init__(self, weight: int = 1, analysis_settings: AnalysisSettings = None): @abstractmethod def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame | float: """This method should compare two tables and return distance table""" - diff --git a/similarity_runner/src/interfaces/ui.py b/similarity_runner/src/interfaces/ui.py index 5184581..cd5f8c3 100644 --- a/similarity_runner/src/interfaces/ui.py +++ b/similarity_runner/src/interfaces/ui.py @@ -43,8 +43,3 @@ def run(self): result[(first.name, second.name)] = comparator.compare(first, second, analysis_settings) # TODO: based on analysis settings get specified metadata objects self.show(result, analysis_settings) - - - - - From 0fec728de236af9697566bb115972d7982f4a913 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Wed, 8 Jan 2025 16:15:35 +0100 Subject: [PATCH 10/11] Add small changes --- README.md | 5 +++-- .../src/impl/comparator/comparator_by_column.py | 2 +- .../src/impl/comparator/comparator_by_type.py | 7 ++++++- similarity_framework/src/impl/comparator/utils.py | 4 ++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 130ea4a..8d9430f 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,12 @@ ## What is Datasets Similarity? 
-The Dataset Similarity project deals with the +The Dataset Similarity project deals with the issue of comparing tabular datasets. The idea of the project is that we will have a set of datasets that we want to compare with each other and find out their similarity or distance. -This project mainly focuses on comparing only two tables. +This project mainly focuses on comparing only two tables but it implements `similarity_runner` that can compare more tables. The final similarity is calculated according to the similarity of individual columns based on their metadata. Columns are compared by type and by content. @@ -27,6 +27,7 @@ the main set (training) on which the program is tuned, and a validation set for validating the results. #### Definition of table similarity: +Two tables are similar if they have at least *k* similar columns. ![img_1.png](docs/similarity_def.png) >Parameter **important columns** is user input. > diff --git a/similarity_framework/src/impl/comparator/comparator_by_column.py b/similarity_framework/src/impl/comparator/comparator_by_column.py index cf44559..9b7d976 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_column.py +++ b/similarity_framework/src/impl/comparator/comparator_by_column.py @@ -107,7 +107,7 @@ def _compare(self, metadata1: Metadata, metadata2: Metadata) -> SimilarityOutput res = self.distance_function.compute(distances) res = res * res else: - res = 0 + res = 1 if table_distances: for dist in table_distances: res += dist * dist diff --git a/similarity_framework/src/impl/comparator/comparator_by_type.py b/similarity_framework/src/impl/comparator/comparator_by_type.py index 2cfe6a1..77d1dbf 100644 --- a/similarity_framework/src/impl/comparator/comparator_by_type.py +++ b/similarity_framework/src/impl/comparator/comparator_by_type.py @@ -81,7 +81,12 @@ def add_comparator_type(self, comparator: HandlerType) -> "ComparatorByType": """ Add comparator """ - self.comparator_type.append(comparator) + if comparator == 
ColumnKindHandler: + self.kinds = True + if comparator == ColumnTypeHandler: + self.types = True + else: + self.comparator_type.append(comparator) return self def __compare_all_columns( diff --git a/similarity_framework/src/impl/comparator/utils.py b/similarity_framework/src/impl/comparator/utils.py index c302395..2629dc7 100644 --- a/similarity_framework/src/impl/comparator/utils.py +++ b/similarity_framework/src/impl/comparator/utils.py @@ -42,8 +42,8 @@ def get_ratio(count1: int, count2: int) -> float: if count1 == 0 or count2 == 0: return 1 if count1 < count2: - return count2 / count1 - return count1 / count2 + return count1 / count2 + return count2 / count1 def fill_result(metadata1_names, metadata2_names) -> pd.DataFrame: From 548328dfdbfd59f736e053b0a0117ba861ed65b1 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Wed, 8 Jan 2025 22:55:49 +0100 Subject: [PATCH 11/11] Add small changes --- .../test_similarity_comparator.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/similarity_framework/test_similarity_comparator.py b/tests/similarity_framework/test_similarity_comparator.py index f29dc9a..adaa665 100644 --- a/tests/similarity_framework/test_similarity_comparator.py +++ b/tests/similarity_framework/test_similarity_comparator.py @@ -1,6 +1,7 @@ import os import unittest +import numpy as np import pandas as pd from pyarrow import Tensor from sentence_transformers import SentenceTransformer @@ -55,10 +56,10 @@ def test_average_dist(self): self.assertEqual(AverageDist().compute(df4), 6/3) def test_get_ratio(self): - self.assertEqual(round(get_ratio(3, 5), 2), 1.67) - self.assertEqual(round(get_ratio(5, 3), 2), 1.67) - self.assertEqual(round(get_ratio(15, 9), 2), 1.67) - self.assertEqual(round(get_ratio(9, 15), 2), 1.67) + self.assertEqual(round(get_ratio(3, 5), 2), 0.6) + self.assertEqual(round(get_ratio(5, 3), 2), 0.6) + self.assertEqual(round(get_ratio(15, 9), 2), 0.6) + self.assertEqual(round(get_ratio(9, 
15), 2), 0.6) def test_cosine_sim(self): self.assertEqual(cosine_sim([1, 2, 3], [1, 2, 3]), 1) @@ -272,15 +273,16 @@ def setUp(self): self.metadata_second_half = self.metadata_creator.get_metadata(MetadataCreatorInput(dataframe=self.data_second_half)) def test_size_compare(self): - self.compartor.add_comparator_type(SizeHandlerByColumn()) - - self.assertEqual(self.compartor.compare(self.metadata1, self.metadata1).distance, 0) - self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) - self.assertEqual(self.compartor.compare(self.metadata_first_half, self.metadata_second_half).distance, 0) - self.compartor.add_settings(Settings.NO_RATIO) - self.assertEqual(self.compartor.compare(self.metadata1, self.metadata1).distance, 0) - self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0) - self.assertEqual(self.compartor.compare(self.metadata_first_half, self.metadata_second_half).distance, 0) + comparator = ComparatorByColumn() + comparator.add_comparator_type(SizeHandlerByColumn()) + res = np.sqrt(1 / 2) + self.assertEqual(comparator.compare(self.metadata1, self.metadata1).distance, res) + self.assertEqual(comparator.compare(self.metadata1, self.metadata_diff_column_names).distance, res) + self.assertEqual(comparator.compare(self.metadata_first_half, self.metadata_second_half).distance, res) + comparator.add_settings(Settings.NO_RATIO) + self.assertEqual(comparator.compare(self.metadata1, self.metadata1).distance, res) + self.assertEqual(comparator.compare(self.metadata1, self.metadata_diff_column_names).distance, res) + self.assertEqual(comparator.compare(self.metadata_first_half, self.metadata_second_half).distance, res) def test_incomplete_compare(self): self.compartor.add_comparator_type(IncompleteColumnsHandlerByColumn())