From e0a326eb5b4982be728ce7e1c167d7acedbb7ac6 Mon Sep 17 00:00:00 2001 From: Jonathan Ahrens <18541228+teikmeout@users.noreply.github.com> Date: Tue, 14 May 2024 11:40:41 -0700 Subject: [PATCH 1/4] Add deno command --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4d37b3c..ba94971 100644 --- a/README.md +++ b/README.md @@ -28,5 +28,6 @@ gh repo clone NYULibraries/dlts-epub-metadata 2) Run the ingest script ```bash -cd scripts/ingest-documents +cd src/scripts/ingest-document +deno run --allow-net --allow-read index.ts ``` From e9b94397f77dce89fadb6563a427ab142564c303 Mon Sep 17 00:00:00 2001 From: Jonathan Ahrens <18541228+teikmeout@users.noreply.github.com> Date: Tue, 14 May 2024 11:41:08 -0700 Subject: [PATCH 2/4] Add/Remove properties according to new schema --- src/scripts/ingest-document/index.ts | 73 ++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/src/scripts/ingest-document/index.ts b/src/scripts/ingest-document/index.ts index 1f2ac02..e9f393a 100644 --- a/src/scripts/ingest-document/index.ts +++ b/src/scripts/ingest-document/index.ts @@ -10,9 +10,9 @@ const solrUrl = `${solrHost}/${solrCollection}/update/json?commit=true` // Metadata collection const metadataCollection = 'nyupress' // Define the directory containing the JSON files -const dir = Deno.cwd() + '/../../dlts-epub-metadata/' + metadataCollection +const dir = Deno.cwd() + '/../../dlts-epub-metadata/' + metadataCollection; -// Read the directory +// Read the directory and loop through each file for await (const dirEntry of Deno.readDir(dir)) { try { const isbn = dirEntry.name @@ -21,14 +21,71 @@ for await (const dirEntry of Deno.readDir(dir)) { // Parse the JSON string into an object const doc = JSON.parse(jsonStr) - delete doc.isDownloadable - delete doc.nyu_press_website_buy_the_book_url - delete doc.permanent_url - delete doc.rootUrl + const authors: string[] = doc.author.split(', ') + 
console.log(authors) + // create the full author object, unordered + const oa = []; + for (let i = 0; i < authors.length; i++) { + oa.push({ + "contributors.bio": "", + "contributors.name": authors[i], + "contributors.nameSort": authors[i], + "contributors.order": i + 1, + "contributors.role": "author", + }) + } + const final = JSON.stringify(oa) + + const flatReviews = JSON.stringify([ + { + "reviews.review": "", + "reviews.reviewer": "", + } + ]) + + // add the properties missing in the schema from Supadu doc.id = doc.identifier + doc.contributors = final; doc.collection_code = 'oa-books' doc.handle = doc.identifier + doc.publicationPlace = doc.coverage + doc.dateBook = doc.date + doc.descriptionHtml = doc.description_html + doc.pages = doc.format + doc.openSquareId = doc.identifier + doc.licenseAbbreviation = doc.license_abbreviation + doc.licenseIcon = doc.license_icon + doc.licenseLink = doc.license_link + doc.subjects = doc.subject + doc.titleSort = doc.title_sort + doc.pressUrl = doc.nyu_press_website_buy_the_book_url + doc.reviews = flatReviews + // required, but sometimes empty + doc.series = doc.series_names || ""; + + // remove the properties that are not in the schema + delete doc.author + delete doc.author_sort + delete doc.coverHref + delete doc.coverage + delete doc.date + delete doc.description_html + delete doc.format + delete doc.identifier + delete doc.isDownloadable + delete doc.license_abbreviation + delete doc.license_icon + delete doc.license_link + delete doc.nyu_press_website_buy_the_book_url + delete doc.packageUrl + delete doc.permanent_url + delete doc.rights + delete doc.rootUrl + delete doc.subject + delete doc.series_names + delete doc.thumbHref + delete doc.title_sort // Define the headers for the POST request const headers = new Headers() @@ -63,8 +120,4 @@ for await (const dirEntry of Deno.readDir(dir)) { console.log(err) console.log('-'.repeat(80)) } - } - - - From eec96ebcbc56a89ca6d4fce9bdbd70f439effd85 Mon Sep 17 00:00:00 2001 
From: Jonathan Ahrens <18541228+teikmeout@users.noreply.github.com> Date: Tue, 14 May 2024 11:41:32 -0700 Subject: [PATCH 3/4] Change the branch for solr config to PR branch --- src/solr/open-square-metadata | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solr/open-square-metadata b/src/solr/open-square-metadata index c3de2a8..474f260 160000 --- a/src/solr/open-square-metadata +++ b/src/solr/open-square-metadata @@ -1 +1 @@ -Subproject commit c3de2a8b4b5a08f987ca1c95d5a103208a5e6a0f +Subproject commit 474f26049830f7d7a596cf96643d3ec58f393733 From f77840f4204ea5fe2787c15dd1cb3c12c64f8d2e Mon Sep 17 00:00:00 2001 From: Jonathan Ahrens <18541228+teikmeout@users.noreply.github.com> Date: Tue, 14 May 2024 11:45:51 -0700 Subject: [PATCH 4/4] Add dlts-epub-metadata to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8fabaeb..a70cb4b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ # src/solr/open-square-metadata +src/dlts-epub-metadata