"""Load Kickstarter project dumps into in-memory lookup tables.

Reconstructed from the ``kickstarter/data_extraction/preprocess.py`` hunk of
a git diff. The same diff also appends ``data`` to ``.gitignore`` and adds an
empty ``kickstarter/data_extraction/__init__.py``.
"""
import json
from typing import Dict, List, TextIO

from kickstarter.models import CategoryModel, CreatorModel, ProjectModel


def _first_significant_char(file: TextIO) -> str:
    """Return the first non-whitespace character of *file*, or "" if none."""
    while True:
        char = file.read(1)
        if not char or not char.isspace():
            return char


def process_json(
    filename: str,
    projects: Dict[int, ProjectModel],
    categories: Dict[int, CategoryModel],
    creators: Dict[int, CreatorModel],
) -> None:
    """Read one dump file and merge its projects into the given dicts.

    Two layouts are accepted, chosen by the first non-whitespace character:

    * ``[`` - a single JSON array whose elements are objects carrying a
      ``"projects"`` list of project dicts.
    * ``{`` - line-delimited JSON, one object per line, with the project
      dict stored under ``"data"``.

    Args:
        filename: Path of the UTF-8 encoded file to read.
        projects: Mapping of project id to model; updated in place.
        categories: Mapping of category id to model; updated in place.
        creators: Mapping of creator id to model; updated in place.

    Raises:
        ValueError: If the file is empty or starts with anything other
            than ``[`` or ``{``.
    """
    with open(filename, "r", encoding="utf-8") as file:
        # Skip leading whitespace so a stray newline does not hide the format.
        filetype = _first_significant_char(file)
        file.seek(0)

        if filetype == "[":
            jsondata: List[dict] = json.load(file)
            for wrapper in jsondata:
                projects_dict_list: List[Dict] = wrapper["projects"]
                for project_dict in projects_dict_list:
                    process_new_project(
                        project_dict, projects, categories, creators
                    )
        elif filetype == "{":
            for line in file:
                # Blank lines would make json.loads raise.
                if not line.strip():
                    continue
                project_dict = json.loads(line)["data"]
                process_new_project(
                    project_dict, projects, categories, creators
                )
        else:
            # ValueError is still caught by callers handling Exception.
            raise ValueError("Invalid file format")


def process_new_project(
    project_dict: Dict,
    projects: Dict[int, ProjectModel],
    categories: Dict[int, CategoryModel],
    creators: Dict[int, CreatorModel],
) -> None:
    """Build models from one project dict and store them by id.

    The project is skipped when ``projects`` already holds an entry with the
    same id whose ``state_changed_at`` is strictly greater; on a tie the
    incoming project replaces the stored one. When the project is stored, its
    category and creator overwrite any existing entries with the same ids.

    Args:
        project_dict: Raw project data; must contain ``"category"`` and
            ``"creator"`` sub-dicts.
        projects: Mapping of project id to model; updated in place.
        categories: Mapping of category id to model; updated in place.
        creators: Mapping of creator id to model; updated in place.
    """
    # All three models are built up front, so malformed input raises even
    # when the project turns out to be stale.
    project = ProjectModel(**project_dict)
    category = CategoryModel(**project_dict["category"])
    creator = CreatorModel(**project_dict["creator"])

    existing = projects.get(project.id)
    if (
        existing is not None
        and existing.state_changed_at > project.state_changed_at
    ):
        return

    projects[project.id] = project
    categories[category.id] = category
    creators[creator.id] = creator