Skip to content

Commit

Permalink
Merge pull request #159 from openstates/import-no-dupe-checks-in-import
Browse files Browse the repository at this point in the history
Stop checking for duplicates in imports
  • Loading branch information
jessemortenson authored Jan 27, 2025
2 parents 7504d87 + e7168a0 commit 5907f4c
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 10 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## 6.20.14 - Jan 27, 2025

* Allow duplicate items to be imported during the import step via new runtime flag --allow_duplicates

## 6.20.13 - Dec 27, 2024

* Sanitize phone number for US people scrape.
Expand Down
12 changes: 9 additions & 3 deletions openstates/cli/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,11 @@ def do_import(juris: State, args: argparse.Namespace) -> dict[str, typing.Any]:
logger.info("import jurisdictions...")
report.update(juris_importer.import_directory(datadir))
logger.info("import bills...")
report.update(bill_importer.import_directory(datadir))
report.update(bill_importer.import_directory(datadir, allow_duplicates=args.allow_duplicates))
logger.info("import vote events...")
report.update(vote_event_importer.import_directory(datadir))
report.update(vote_event_importer.import_directory(datadir, allow_duplicates=args.allow_duplicates))
logger.info("import events...")
report.update(event_importer.import_directory(datadir))
report.update(event_importer.import_directory(datadir, allow_duplicates=args.allow_duplicates))
DatabaseJurisdiction.objects.filter(id=juris.jurisdiction_id).update(
latest_bill_update=datetime.datetime.utcnow()
)
Expand Down Expand Up @@ -520,6 +520,12 @@ def parse_args() -> tuple[argparse.Namespace, list[str]]:
dest="strict",
help="skip validation on save",
)
parser.add_argument(
"--allow_duplicates",
action="store_true",
dest="allow_duplicates",
        help="Skip throwing a DuplicateItemError, instead allow import of duplicate items",
)
parser.add_argument(
"--fastmode", action="store_true", help="use cache and turn off throttling"
)
Expand Down
16 changes: 10 additions & 6 deletions openstates/importers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def resolve_json_id(
except KeyError:
raise UnresolvedIdError("cannot resolve id: {}".format(json_id))

def import_directory(self, datadir: str) -> typing.Dict[str, typing.Dict]:
def import_directory(self, datadir: str, allow_duplicates=False) -> typing.Dict[str, typing.Dict]:
"""import a JSON directory into the database"""

def json_stream() -> typing.Iterator[_JsonDict]:
Expand All @@ -282,7 +282,7 @@ def json_stream() -> typing.Iterator[_JsonDict]:
with open(fname) as f:
yield json.load(f)

return self.import_data(json_stream())
return self.import_data(json_stream(), allow_duplicates)

def _prepare_imports(
self, dicts: typing.Iterable[_JsonDict]
Expand All @@ -309,7 +309,7 @@ def _prepare_imports(
self.duplicates[json_id] = seen_hashes[objhash]

def import_data(
self, data_items: typing.Iterable[_JsonDict]
self, data_items: typing.Iterable[_JsonDict], allow_duplicates=False
) -> typing.Dict[str, typing.Dict]:
"""import a bunch of dicts together"""
# keep counts of all actions
Expand All @@ -322,7 +322,7 @@ def import_data(
}

for json_id, data in self._prepare_imports(data_items):
obj_id, what = self.import_item(data)
obj_id, what = self.import_item(data, allow_duplicates)
if not obj_id or not what:
"Skipping data because it did not have an associated ID or type"
continue
Expand All @@ -341,7 +341,7 @@ def import_data(

return {self._type: record}

def import_item(self, data: _JsonDict) -> typing.Tuple[_ID, str]:
def import_item(self, data: _JsonDict, allow_duplicates=False) -> typing.Tuple[_ID, str]:
"""function used by import_data"""
what = "noop"

Expand Down Expand Up @@ -369,8 +369,12 @@ def import_item(self, data: _JsonDict) -> typing.Tuple[_ID, str]:

# obj existed, check if we need to do an update
if obj:
if obj.id in self.json_to_db_id.values():
# If --allow_duplicates flag is set on client CLI command
# then we ignore duplicates instead of raising an exception
if not allow_duplicates and obj.id in self.json_to_db_id.values():
raise DuplicateItemError(data, obj, related.get("sources", []))
elif allow_duplicates and obj.id in self.json_to_db_id.values():
self.logger.warning(f"Ignored a DuplicateItemError for {obj.id}")
# check base object for changes
for key, value in data.items():
if getattr(obj, key) != value:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "openstates"
version = "6.20.13"
version = "6.20.14"
description = "core infrastructure for the openstates project"
authors = ["James Turk <[email protected]>"]
license = "MIT"
Expand Down

0 comments on commit 5907f4c

Please sign in to comment.