From e1404d5ef139af6c3c4a4783e50dcf10c0f744b8 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 7 Jan 2025 15:57:08 +0200 Subject: [PATCH 1/5] changed span.type to span.entity_type and created deprecation notices for FakerSpans --- notebooks/1_Generate_data.ipynb | 22 +++++++++++-------- .../faker_extensions/data_objects.py | 17 ++++++++++++++ .../faker_extensions/span_generator.py | 2 +- .../data_generator/presidio_data_generator.py | 10 +++++++++ .../data_generator/presidio_sentence_faker.py | 12 +++++----- 5 files changed, 48 insertions(+), 15 deletions(-) diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb index 78da83c..ffaa32e 100644 --- a/notebooks/1_Generate_data.ipynb +++ b/notebooks/1_Generate_data.ipynb @@ -114,9 +114,9 @@ "]\n", "\n", "\n", - "sentence_faker = PresidioSentenceFaker('en_US', \n", - " lower_case_ratio=0.05, \n", - " sentence_templates=sentence_templates)\n", + "sentence_faker = PresidioSentenceFaker(\n", + " \"en_US\", lower_case_ratio=0.05, sentence_templates=sentence_templates\n", + ")\n", "fake_sentence_results = sentence_faker.generate_new_fake_sentences(10)\n", "\n", "# Print the spans of the first sample\n", @@ -145,7 +145,7 @@ "source": [ "number_of_samples = 1500\n", "lower_case_ratio = 0.05\n", - "locale = 'en'\n", + "locale = \"en\"\n", "cur_time = datetime.date.today().strftime(\"%B_%d_%Y\")\n", "\n", "output_file = f\"../data/generated_size_{number_of_samples}_date_{cur_time}.json\"\n", @@ -185,7 +185,7 @@ } ], "source": [ - "sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05)" + "sentence_faker = PresidioSentenceFaker(\"en_US\", lower_case_ratio=0.05)" ] }, { @@ -432,17 +432,19 @@ "import random\n", "from faker.providers import BaseProvider\n", "\n", + "\n", "class MarsIdProvider(BaseProvider):\n", " def mars_id(self):\n", " # Generate a random row number between 1 and 50\n", " row = random.randint(1, 50)\n", " # Generate a random letter for the seat location from A-K\n", - " location = random.choice('ABCDEFGHIJK')\n", + " location = random.choice(\"ABCDEFGHIJK\")\n", " # Return the seat in the format \"row-letter\" (e.g., \"25A\")\n", " return f\"{row}{location}\"\n", "\n", + "\n", "sentence_faker.add_provider(MarsIdProvider)\n", - "# Now a new `mars_id` entity can be generated if a template has `mars_id` in it.\n" + "# Now a new `mars_id` entity can be generated if a template has `mars_id` in it." ] }, { @@ -595,7 +597,9 @@ "\n", "print(f\"Total: {sum(count_per_template_id.values())}\")\n", "print(f\"Avg # of records per template: {np.mean(list(count_per_template_id.values()))}\")\n", - "print(f\"Median # of records per template: {np.median(list(count_per_template_id.values()))}\")\n", + "print(\n", + " f\"Median # of records per template: {np.median(list(count_per_template_id.values()))}\"\n", + ")\n", "print(f\"Std: {np.std(list(count_per_template_id.values()))}\")" ] }, @@ -650,7 +654,7 @@ "source": [ "count_per_entity = Counter()\n", "for record in fake_records:\n", - " count_per_entity.update(Counter([span.type for span in record.spans]))\n", + " count_per_entity.update(Counter([span.entity_type for span in record.spans]))\n", "\n", "count_per_entity" ] diff --git a/presidio_evaluator/data_generator/faker_extensions/data_objects.py b/presidio_evaluator/data_generator/faker_extensions/data_objects.py index ea49bf5..752ed3e 100644 --- a/presidio_evaluator/data_generator/faker_extensions/data_objects.py +++ b/presidio_evaluator/data_generator/faker_extensions/data_objects.py @@ -1,3 +1,4 @@ +import warnings from dataclasses import dataclass import dataclasses import json @@ -16,6 +17,14 @@ class FakerSpan: end: int type: str + def __post_init__(self): + warnings.warn( + "FakerSpan is deprecated and will be removed in future versions." + "Use Span instead", + category=DeprecationWarning, + stacklevel=2, + ) + def __repr__(self): return json.dumps(dataclasses.asdict(self)) @@ -31,6 +40,14 @@ class FakerSpansResult: template_id: Optional[int] = None sample_id: Optional[int] = None + def __post_init__(self): + warnings.warn( + "FakerSpansResult is deprecated and will be removed in future versions." + "Use InputSample instead", + category=DeprecationWarning, + stacklevel=2, + ) + def __str__(self): return self.fake diff --git a/presidio_evaluator/data_generator/faker_extensions/span_generator.py b/presidio_evaluator/data_generator/faker_extensions/span_generator.py index 57a138b..b6d4b7e 100644 --- a/presidio_evaluator/data_generator/faker_extensions/span_generator.py +++ b/presidio_evaluator/data_generator/faker_extensions/span_generator.py @@ -67,7 +67,7 @@ def parse( # Update span indices delta = new_len - old_len span.end_position = span.end_position + delta - span.type = formatter.strip() + span.entity_type = formatter.strip() # Update previously inserted spans since indices shifted for j in range(0, i): diff --git a/presidio_evaluator/data_generator/presidio_data_generator.py b/presidio_evaluator/data_generator/presidio_data_generator.py index cef85a4..9e1bb19 100644 --- a/presidio_evaluator/data_generator/presidio_data_generator.py +++ b/presidio_evaluator/data_generator/presidio_data_generator.py @@ -2,6 +2,7 @@ import json import random import re +import warnings from pathlib import Path from typing import List, Optional, Union, Generator @@ -67,6 +68,15 @@ def __init__( [{"value": "Ukraine", "start": 31, "end": 38, "type": "country"}, {"value": "North Kim", "start": 16, "end": 25, "type": "city"}] """ + + def __post_init__(self): + warnings.warn( + "PresidioDataGenerator is deprecated and will be removed in future versions." + "Use PresidioSentenceFaker instead", + category=DeprecationWarning, + stacklevel=2, + ) + if custom_faker and locale: raise ValueError( "If a custom faker is passed, it's expected to have its locales loaded" diff --git a/presidio_evaluator/data_generator/presidio_sentence_faker.py b/presidio_evaluator/data_generator/presidio_sentence_faker.py index abc3205..772825d 100644 --- a/presidio_evaluator/data_generator/presidio_sentence_faker.py +++ b/presidio_evaluator/data_generator/presidio_sentence_faker.py @@ -170,18 +170,20 @@ def generate_new_fake_sentences(self, num_samples: int) -> List[InputSample]: template = self._preprocess_template(template) fake_sentence_result = self._sentence_faker.parse(template, template_id) for span in fake_sentence_result.spans: - if span.type in self._entity_type_mapping.keys(): + if span.entity_type in self._entity_type_mapping.keys(): # Use the mapped entity type if exists - span.type = self._entity_type_mapping[span.type] + span.entity_type = self._entity_type_mapping[span.entity_type] else: # Otherwise, capitalize the entity type and add to the mapping print( - f"Warning: Non-mapped entity type found: {span.type}. " - f"Non-mapped entities will be mapped to {span.type.upper()} " + f"Warning: Non-mapped entity type found: {span.entity_type}. " + f"Non-mapped entities will be mapped to {span.entity_type.upper()} " f"in the output dataset. If you prefer a different mapping, " f"pass the `entity_type_mapping` argument with a mapping for this entity type." ) - self._entity_type_mapping[span.type] = span.type.upper() + self._entity_type_mapping[span.entity_type] = ( + span.entity_type.upper() + ) for key, value in self._entity_type_mapping.items(): fake_sentence_result.masked = fake_sentence_result.masked.replace( "{{%s}}" % key, "{{%s}}" % value From 7f25b305b1fd76e9441864b83f1b2b7b6c1d4673 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 7 Jan 2025 17:14:56 +0200 Subject: [PATCH 2/5] fixed deprecation warning --- .../data_generator/faker_extensions/data_objects.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/presidio_evaluator/data_generator/faker_extensions/data_objects.py b/presidio_evaluator/data_generator/faker_extensions/data_objects.py index 752ed3e..3d80a87 100644 --- a/presidio_evaluator/data_generator/faker_extensions/data_objects.py +++ b/presidio_evaluator/data_generator/faker_extensions/data_objects.py @@ -17,7 +17,7 @@ class FakerSpan: end: int type: str - def __post_init__(self): + def __new__(cls, *args, **kwargs): warnings.warn( "FakerSpan is deprecated and will be removed in future versions." "Use Span instead", @@ -25,6 +25,8 @@ def __post_init__(self): stacklevel=2, ) + return super().__new__(cls) + def __repr__(self): return json.dumps(dataclasses.asdict(self)) @@ -40,7 +42,7 @@ class FakerSpansResult: template_id: Optional[int] = None sample_id: Optional[int] = None - def __post_init__(self): + def __new__(cls, *args, **kwargs): warnings.warn( "FakerSpansResult is deprecated and will be removed in future versions." "Use InputSample instead", @@ -48,6 +50,8 @@ def __post_init__(self): stacklevel=2, ) + return super().__new__(cls) + def __str__(self): return self.fake From d518bab3088eacf6b47103d252ba1611b6129286 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 8 Jan 2025 09:19:19 +0200 Subject: [PATCH 3/5] updated version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index eca721e..41d5980 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "presidio_evaluator" -version = "0.2.1" +version = "0.2.2" description = "" authors = ["Microsoft"] readme = "README.md" From 88bf02f14004889c9b7d78e5e377f6790aad9a22 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 8 Jan 2025 09:19:29 +0200 Subject: [PATCH 4/5] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 0ea3a94..ee1372d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.0 +0.2.2 From fb1708c524fb3e5c62588af8460d8ad3fa804f60 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 8 Jan 2025 11:14:59 +0200 Subject: [PATCH 5/5] updated version and bug fixes in presidio_data_generator --- VERSION | 2 +- notebooks/1_Generate_data.ipynb | 260 +++++++++--------- .../data_generator/presidio_data_generator.py | 78 +----- pyproject.toml | 4 +- 4 files changed, 143 insertions(+), 201 deletions(-) diff --git a/VERSION b/VERSION index 0ea3a94..ee1372d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.0 +0.2.2 diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb index ffaa32e..5f60e67 100644 --- a/notebooks/1_Generate_data.ipynb +++ b/notebooks/1_Generate_data.ipynb @@ -87,15 +87,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "Sampling: 100%|██████████| 10/10 [00:00<00:00, 3959.88it/s]" + "Sampling: 100%|██████████| 10/10 [00:00<00:00, 4370.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Please send it to {{STREET_ADDRESS}}\n", - "[Span(type: address, value: the corner of Καλαμπάκα 33 and Stefan Land, char_span: [18: 60])]\n" + "I just moved to {{GPE}} from {{GPE}}\n", + "[Span(type: GPE, value: Spain, char_span: [45: 50]), Span(type: GPE, value: Valverde de Valdelacasa, char_span: [16: 39])]\n" ] }, { @@ -160,8 +160,8 @@ } }, "source": [ - "The `PresidioSentenceFaker` loads [FakeNameGenerator](https://www.fakenamegenerator.com/) data by default\n", - "to extend the set of fake values and creates a `RecordsFaker` \n", + "The `PresidioSentenceFaker` is based on the Faker library. It loads [FakeNameGenerator](https://www.fakenamegenerator.com/) data by default\n", + "to extend the set of fake values and creates a `SentenceFaker` \n", "which returns a fake person record (with multiple values) instead of one value,\n", "allowing dependencies between values belonging to the same fake person\n", "(e.g. name = Michael Smith with the email michael.smith@gmail.com).\n", @@ -277,8 +277,8 @@ " ...\n", " Formula Gray\n", " LostMillions.com.pt\n", - " Patricia G. Desrosiers\n", - " Patricia G. Desrosiers\n", + " Patricia Desrosiers\n", + " Patricia Desrosiers\n", " Patricia\n", " \n", " Ms.\n", @@ -301,8 +301,8 @@ " ...\n", " Dahlkemper's\n", " MediumTube.co.za\n", - " Debra O. Neal\n", - " Debra O. Neal\n", + " Debra Neal\n", + " Debra Neal\n", " Debra\n", " \n", " Ms.\n", @@ -325,8 +325,8 @@ " ...\n", " Quickbiz\n", " ImproveLook.com.cy\n", - " Peverell C. Racine\n", - " Peverell C. Racine\n", + " Peverell Racine\n", + " Peverell Racine\n", " \n", " Peverell\n", " \n", @@ -349,8 +349,8 @@ " ...\n", " Dubrow's Cafeteria\n", " PostTan.com.ee\n", - " Iolanda S. Tratnik\n", - " Iolanda S. Tratnik\n", + " Iolanda Tratnik\n", + " Iolanda Tratnik\n", " Iolanda\n", " \n", " Mrs.\n", @@ -378,12 +378,12 @@ "3 183 Epimenidou Street Limassol LI ... Quickbiz \n", "4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n", "\n", - " domain_name person name \\\n", - "0 MarathonDancing.gl Marie Hamanová Marie Hamanová \n", - "1 LostMillions.com.pt Patricia G. Desrosiers Patricia G. Desrosiers \n", - "2 MediumTube.co.za Debra O. Neal Debra O. Neal \n", - "3 ImproveLook.com.cy Peverell C. Racine Peverell C. Racine \n", - "4 PostTan.com.ee Iolanda S. Tratnik Iolanda S. Tratnik \n", + " domain_name person name \\\n", + "0 MarathonDancing.gl Marie Hamanová Marie Hamanová \n", + "1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n", + "2 MediumTube.co.za Debra Neal Debra Neal \n", + "3 ImproveLook.com.cy Peverell Racine Peverell Racine \n", + "4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n", "\n", " first_name_female first_name_male prefix_female prefix_male \\\n", "0 Marie Mrs. \n", @@ -541,15 +541,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 13821.21it/s]" + "Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 8316.22it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n", - "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n", + "Full text: I'll meet you at 323 Postbox 78\n", + " Apt. 637\n", + " Slædepatruljen Sirius\n", + " Greenlander after the concert.\n", + "Spans: [Span(type: STREET_ADDRESS, value: 323 Postbox 78\n", + " Apt. 637\n", + " Slædepatruljen Sirius\n", + " Greenlander, char_span: [17: 77])]\n", "\n" ] }, @@ -588,7 +594,7 @@ "Total: 1500\n", "Avg # of records per template: 7.142857142857143\n", "Median # of records per template: 7.0\n", - "Std: 2.6812526263406258\n" + "Std: 2.7513756608669206\n" ] } ], @@ -627,23 +633,23 @@ { "data": { "text/plain": [ - "Counter({'PERSON': 874,\n", - " 'STREET_ADDRESS': 609,\n", - " 'GPE': 442,\n", - " 'ORGANIZATION': 253,\n", - " 'CREDIT_CARD': 131,\n", - " 'PHONE_NUMBER': 117,\n", - " 'DATE_TIME': 106,\n", - " 'TITLE': 91,\n", - " 'AGE': 79,\n", - " 'NRP': 66,\n", - " 'ZIP_CODE': 42,\n", - " 'EMAIL_ADDRESS': 33,\n", - " 'DOMAIN_NAME': 30,\n", - " 'IBAN_CODE': 26,\n", - " 'IP_ADDRESS': 18,\n", - " 'US_SSN': 18,\n", - " 'US_DRIVER_LICENSE': 9})" + "Counter({'PERSON': 875,\n", + " 'STREET_ADDRESS': 647,\n", + " 'GPE': 462,\n", + " 'ORGANIZATION': 260,\n", + " 'CREDIT_CARD': 146,\n", + " 'PHONE_NUMBER': 101,\n", + " 'DATE_TIME': 96,\n", + " 'TITLE': 88,\n", + " 'AGE': 73,\n", + " 'NRP': 61,\n", + " 'EMAIL_ADDRESS': 47,\n", + " 'ZIP_CODE': 39,\n", + " 'DOMAIN_NAME': 28,\n", + " 'IBAN_CODE': 22,\n", + " 'US_SSN': 11,\n", + " 'IP_ADDRESS': 11,\n", + " 'US_DRIVER_LICENSE': 11})" ] }, "execution_count": 12, @@ -668,49 +674,41 @@ "name": "stdout", "output_type": "stream", "text": [ - "Full text: The title refers to Riddersporen 1 street in STAVANGER. It was on this street that many of the clubs where Metallica first played were situated. \"Battery is found in me\" shows that these early shows on Everardus Mountains Street were important to them. Battery is where \"lunacy finds you\" and you \"smash through the boundaries.\"\n", - "Spans: [Span(type: street_name, value: Everardus Mountains, char_span: [202: 221]), Span(type: city, value: STAVANGER, char_span: [45: 54]), Span(type: street_name, value: Riddersporen 1, char_span: [20: 34])]\n", + "Full text: I'll meet you at 323 Postbox 78\n", + " Apt. 637\n", + " Slædepatruljen Sirius\n", + " Greenlander after the concert.\n", + "Spans: [Span(type: STREET_ADDRESS, value: 323 Postbox 78\n", + " Apt. 637\n", + " Slædepatruljen Sirius\n", + " Greenlander, char_span: [17: 77])]\n", "\n", - "Full text: The Ilta T Ryhänen version recorded for Weatherford International Inc became the first celebrity recording by a classical musician to sell one million copies. The song was awarded the seventh gold disc ever granted.\n", - "Spans: [Span(type: organization, value: Weatherford International Inc, char_span: [40: 69]), Span(type: person, value: Ilta T Ryhänen, char_span: [4: 18])]\n", + "Full text: The Adomos SA Orchestra was founded in 2014. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Portugal\n", + "Spans: [Span(type: GPE, value: Portugal, char_span: [149: 157]), Span(type: DATE_TIME, value: 2014, char_span: [39: 43]), Span(type: ORGANIZATION, value: Adomos SA, char_span: [4: 13])]\n", "\n", - "Full text: We'll meet Monday at JAPAN PULP AND PAPER COMPANY LIMITED, 5931 84 Cassinia Street, GUNDAGAI\n", - "Spans: [Span(type: city, value: GUNDAGAI, char_span: [84: 92]), Span(type: street_name, value: 84 Cassinia Street, char_span: [64: 82]), Span(type: building_number, value: 5931, char_span: [59: 63]), Span(type: organization, value: JAPAN PULP AND PAPER COMPANY LIMITED, char_span: [21: 57]), Span(type: day_of_week, value: Monday, char_span: [11: 17])]\n", + "Full text: It's like that since 4/26/1954\n", + "Spans: [Span(type: DATE_TIME, value: 4/26/1954, char_span: [21: 30])]\n", "\n", - "Full text: Can someone call me on 0377 7151585? I have some questions about opening an account.\n", - "Spans: [Span(type: phone_number, value: 0377 7151585, char_span: [23: 35])]\n", + "Full text: One of the most depressing songs on the list. He's injured from the waist down from New Zealand, but Rinoka just has to get laid. Don't go to town, Lisa!\n", + "Spans: [Span(type: PERSON, value: Lisa, char_span: [148: 152]), Span(type: PERSON, value: Rinoka, char_span: [101: 107]), Span(type: GPE, value: New Zealand, char_span: [84: 95])]\n", "\n", - "Full text: Leena R Filppula\\nTelephone and Data Systems Inc.\\nServidão Fernando Albrecht 673 Szemere Radial\n", - " Suite 538\n", - " Joinville\n", - " Brazil 27518\\n032 627 37 30 office\\n(07700)331659 fax\\n+41 47 717 21 68 mobile\\n\n", - "Spans: [Span(type: phone_number, value: +41 47 717 21 68, char_span: [175: 191]), Span(type: phone_number, value: (07700)331659, char_span: [156: 169]), Span(type: phone_number, value: 032 627 37 30, char_span: [134: 147]), Span(type: address, value: Servidão Fernando Albrecht 673 Szemere Radial\n", - " Suite 538\n", - " Joinville\n", - " Brazil 27518, char_span: [51: 132]), Span(type: organization, value: Telephone and Data Systems Inc., char_span: [18: 49]), Span(type: name, value: Leena R Filppula, char_span: [0: 16])]\n", + "Full text: Celebrating its 10th year in Maniitsoq, Marshall, Hernandez and Simpson is a 501(c)3 that invites songwriters from around the world to Tomášhaven to share the universal language of music in collaborations designed to bridge cultures, build friendships and cultivate peace.\n", + "Spans: [Span(type: GPE, value: Tomášhaven, char_span: [135: 145]), Span(type: ORGANIZATION, value: Marshall, Hernandez and Simpson, char_span: [40: 71]), Span(type: GPE, value: Maniitsoq, char_span: [29: 38])]\n", "\n", - "Full text: Bot: Where would you like this to be sent to? User: 11129 Rua Forno 76\n", - " Suite 599\n", - " Quinta do Passadouro de Cima\n", - " Portugal 66984\n", - "Spans: [Span(type: address, value: 11129 Rua Forno 76\n", - " Suite 599\n", - " Quinta do Passadouro de Cima\n", - " Portugal 66984, char_span: [52: 127])]\n", + "Full text: I would like to remove my kid Milada from the will. How do I do that?\n", + "Spans: [Span(type: PERSON, value: Milada, char_span: [30: 36])]\n", "\n", - "Full text: One of the most depressing songs on the list. He's injured from the waist down from Spain, but Alexander just has to get laid. Don't go to town, Christopher!\n", - "Spans: [Span(type: first_name, value: Christopher, char_span: [145: 156]), Span(type: first_name, value: Alexander, char_span: [95: 104]), Span(type: country, value: Spain, char_span: [84: 89])]\n", + "Full text: A great song made even greater by a mandolin coda (not by Hugolina Cazares).\n", + "Spans: [Span(type: PERSON, value: Hugolina Cazares, char_span: [58: 74])]\n", "\n", - "Full text: Our offices are located at Romina and Müürivahe 27\n", - "Spans: [Span(type: address, value: Romina and Müürivahe 27, char_span: [27: 50])]\n", + "Full text: Who's coming to New Zealand with me?\n", + "Spans: [Span(type: GPE, value: New Zealand, char_span: [16: 27])]\n", "\n", - "Full text: Meet me at Unit 8161 Box 6817\n", - "DPO AE 26241\n", - "Spans: [Span(type: address, value: Unit 8161 Box 6817\n", - "DPO AE 26241, char_span: [11: 42])]\n", + "Full text: For my take on Ms. Portič, see Guilty Pleasures: 5 Musicians Of The 70s You're Supposed To Hate (But Secretly Love)\n", + "Spans: [Span(type: PERSON, value: Portič, char_span: [19: 25]), Span(type: TITLE, value: Ms., char_span: [15: 18])]\n", "\n", - "Full text: How do I open my credit card statement?\n", - "Spans: []\n", + "Full text: Blink-182 pay tribute here to the Switzerland. Producer Jiří Lukášek explained to Fuse TV: \"We all liked the idea of writing a song about our state, where we live and love. To me it's the most beautiful place in the world, this song was us giving credit to how lucky we are to have lived here and grown up here, raising families here, the whole thing.\"\n", + "Spans: [Span(type: PERSON, value: Jiří Lukášek, char_span: [56: 68]), Span(type: GPE, value: Switzerland, char_span: [34: 45])]\n", "\n" ] } @@ -753,7 +751,7 @@ { "data": { "text/plain": [ - "'../data/generated_size_1500_date_January_06_2025.json'" + "'../data/generated_size_1500_date_January_08_2025.json'" ] }, "execution_count": 15, @@ -804,7 +802,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1500/1500 [00:03<00:00, 386.94it/s]\n" + "100%|██████████| 1500/1500 [00:04<00:00, 320.23it/s]\n" ] }, { @@ -839,92 +837,92 @@ " \n", " \n", " 0\n", - " The\n", - " DET\n", - " DT\n", - " 110\n", + " I\n", + " PRON\n", + " PRP\n", + " 46\n", " O\n", " 0\n", " \n", " \n", " 1\n", - " title\n", - " NOUN\n", - " NN\n", - " 110\n", + " 'll\n", + " AUX\n", + " MD\n", + " 46\n", " O\n", " 0\n", " \n", " \n", " 2\n", - " refers\n", + " meet\n", " VERB\n", - " VBZ\n", - " 110\n", + " VB\n", + " 46\n", " O\n", " 0\n", " \n", " \n", " 3\n", - " to\n", - " ADP\n", - " IN\n", - " 110\n", + " you\n", + " PRON\n", + " PRP\n", + " 46\n", " O\n", " 0\n", " \n", " \n", " 4\n", - " Riddersporen\n", - " PROPN\n", - " NNP\n", - " 110\n", - " B-street_name\n", + " at\n", + " ADP\n", + " IN\n", + " 46\n", + " O\n", " 0\n", " \n", " \n", " 5\n", - " 1\n", + " 323\n", " NUM\n", " CD\n", - " 110\n", - " I-street_name\n", + " 46\n", + " B-STREET_ADDRESS\n", " 0\n", " \n", " \n", " 6\n", - " street\n", - " NOUN\n", - " NN\n", - " 110\n", - " O\n", + " Postbox\n", + " PROPN\n", + " NNP\n", + " 46\n", + " I-STREET_ADDRESS\n", " 0\n", " \n", " \n", " 7\n", - " in\n", - " ADP\n", - " IN\n", - " 110\n", - " O\n", + " 78\n", + " NUM\n", + " CD\n", + " 46\n", + " I-STREET_ADDRESS\n", " 0\n", " \n", " \n", " 8\n", - " STAVANGER\n", - " PROPN\n", - " NNP\n", - " 110\n", - " B-city\n", + " \\n\n", + " SPACE\n", + " _SP\n", + " 46\n", + " I-STREET_ADDRESS\n", " 0\n", " \n", " \n", " 9\n", - " .\n", - " PUNCT\n", - " .\n", - " 110\n", - " O\n", + " Apt\n", + " PROPN\n", + " NNP\n", + " 46\n", + " I-STREET_ADDRESS\n", " 0\n", " \n", " \n", @@ -932,17 +930,17 @@ "" ], "text/plain": [ - " text pos tag template_id label sentence\n", - "0 The DET DT 110 O 0\n", - "1 title NOUN NN 110 O 0\n", - "2 refers VERB VBZ 110 O 0\n", - "3 to ADP IN 110 O 0\n", - "4 Riddersporen PROPN NNP 110 B-street_name 0\n", - "5 1 NUM CD 110 I-street_name 0\n", - "6 street NOUN NN 110 O 0\n", - "7 in ADP IN 110 O 0\n", - "8 STAVANGER PROPN NNP 110 B-city 0\n", - "9 . PUNCT . 110 O 0" + " text pos tag template_id label sentence\n", + "0 I PRON PRP 46 O 0\n", + "1 'll AUX MD 46 O 0\n", + "2 meet VERB VB 46 O 0\n", + "3 you PRON PRP 46 O 0\n", + "4 at ADP IN 46 O 0\n", + "5 323 NUM CD 46 B-STREET_ADDRESS 0\n", + "6 Postbox PROPN NNP 46 I-STREET_ADDRESS 0\n", + "7 78 NUM CD 46 I-STREET_ADDRESS 0\n", + "8 \\n SPACE _SP 46 I-STREET_ADDRESS 0\n", + "9 Apt PROPN NNP 46 I-STREET_ADDRESS 0" ] }, "execution_count": 16, @@ -969,7 +967,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "CoNLL2003 dataset structure output location: ../data/generated_size_1500_date_January_06_2025.tsv\n" + "CoNLL2003 dataset structure output location: ../data/generated_size_1500_date_January_08_2025.tsv\n" ] } ], diff --git a/presidio_evaluator/data_generator/presidio_data_generator.py b/presidio_evaluator/data_generator/presidio_data_generator.py index 9e1bb19..f555bff 100644 --- a/presidio_evaluator/data_generator/presidio_data_generator.py +++ b/presidio_evaluator/data_generator/presidio_data_generator.py @@ -1,9 +1,6 @@ -import dataclasses -import json import random import re import warnings -from pathlib import Path from typing import List, Optional, Union, Generator import numpy as np @@ -14,16 +11,11 @@ from tqdm import tqdm from presidio_evaluator.data_generator.faker_extensions import ( - FakerSpansResult, - NationalityProvider, - OrganizationProvider, - UsDriverLicenseProvider, - IpAddressProvider, - AddressProviderNew, SpanGenerator, - RecordsFaker, - PhoneNumberProviderNew, - AgeProvider, +) + +from presidio_evaluator.data_generator.faker_extensions.data_objects import ( + FakerSpansResult, ) @@ -44,8 +36,6 @@ def __init__( :example: - >>>from presidio_evaluator.data_generator import PresidioDataGenerator - >>>sentence_templates = [ >>> "My name is {{name}}", >>> "Please send it to {{address}}", @@ -69,13 +59,12 @@ def __init__( """ - def __post_init__(self): - warnings.warn( - "PresidioDataGenerator is deprecated and will be removed in future versions." - "Use PresidioSentenceFaker instead", - category=DeprecationWarning, - stacklevel=2, - ) + warnings.warn( + "PresidioDataGenerator is deprecated and will be removed in future versions." + "Use PresidioSentenceFaker instead", + category=DeprecationWarning, + stacklevel=2, + ) if custom_faker and locale: raise ValueError( @@ -290,50 +279,3 @@ def name_gendered(row): fake_data = pd.concat([fake_data, genderized], axis="columns") return fake_data - - -if __name__ == "__main__": - PresidioDataGenerator.seed(42) - - template_file_path = Path(Path(__file__).parent, "raw_data", "templates.txt") - - # Read FakeNameGenerator data - fake_data_df = pd.read_csv( - Path(Path(__file__).parent, "raw_data", "FakeNameGenerator.com_3000.csv") - ) - # Convert column names to lowercase to match patterns - fake_data_df = PresidioDataGenerator.update_fake_name_generator_df(fake_data_df) - - # Create a RecordsFaker (Faker object which prefers samples multiple objects from one record) - faker = RecordsFaker(records=fake_data_df, local="en_US") - faker.add_provider(IpAddressProvider) - faker.add_provider(NationalityProvider) - faker.add_provider(OrganizationProvider) - faker.add_provider(UsDriverLicenseProvider) - faker.add_provider(AgeProvider) - faker.add_provider(AddressProviderNew) # More address formats than Faker - faker.add_provider(PhoneNumberProviderNew) # More phone number formats than Faker - - # Create Presidio Data Generator - data_generator = PresidioDataGenerator(custom_faker=faker, lower_case_ratio=0.05) - data_generator.add_provider_alias(provider_name="name", new_name="person") - data_generator.add_provider_alias( - provider_name="credit_card_number", new_name="credit_card" - ) - data_generator.add_provider_alias( - provider_name="date_of_birth", new_name="birthday" - ) - - sentence_templates = PresidioDataGenerator.read_template_file(template_file_path) - fake_patterns = data_generator.generate_fake_data( - templates=sentence_templates, n_samples=10000 - ) - - # save to json - output_file = Path( - Path(__file__).parent.parent.parent, "data", "presidio_data_generator_data.json" - ) - - to_json = [dataclasses.asdict(pattern) for pattern in fake_patterns] - with open("{}".format(output_file), "w+", encoding="utf-8") as f: - json.dump(to_json, f, ensure_ascii=False, indent=2) diff --git a/pyproject.toml b/pyproject.toml index eca721e..2064a51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "presidio_evaluator" -version = "0.2.1" +version = "0.2.2" description = "" authors = ["Microsoft"] readme = "README.md" @@ -47,6 +47,8 @@ build-backend = "poetry.core.masonry.api" [tool.ruff] line-length = 88 exclude = [".git", "__pycache__", "build", "dist", "tests"] + +[tool.ruff.lint] ignore = ["E203", "D100", "D202"] [tool.pytest.ini_options]