NYUP 739 schema changes #2

Open · wants to merge 4 commits into base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -1 +1,2 @@
 # src/solr/open-square-metadata
+src/dlts-epub-metadata
3 changes: 2 additions & 1 deletion README.md
@@ -28,5 +28,6 @@ gh repo clone NYULibraries/dlts-epub-metadata
 2) Run the ingest script
+
 ```bash
-cd scripts/ingest-documents
+cd src/scripts/ingest-documents
 deno run --allow-net --allow-read index.ts
 ```
73 changes: 63 additions & 10 deletions src/scripts/ingest-document/index.ts
@@ -10,9 +10,9 @@ const solrUrl = `${solrHost}/${solrCollection}/update/json?commit=true`
 // Metadata collection
 const metadataCollection = 'nyupress'
 // Define the directory containing the JSON files
-const dir = Deno.cwd() + '/../../dlts-epub-metadata/' + metadataCollection
+const dir = Deno.cwd() + '/../../dlts-epub-metadata/' + metadataCollection;
 
-// Read the directory
+// Read the directory and loop through each file
 for await (const dirEntry of Deno.readDir(dir)) {
   try {
     const isbn = dirEntry.name
@@ -21,14 +21,71 @@ for await (const dirEntry of Deno.readDir(dir)) {
     // Parse the JSON string into an object
     const doc = JSON.parse(jsonStr)
 
-    delete doc.isDownloadable
-    delete doc.nyu_press_website_buy_the_book_url
-    delete doc.permanent_url
-    delete doc.rootUrl
+    const authors: string[] = doc.author.split(', ')
+    console.log(authors)
+
+    // create the full author object, unordered
+    const oa = [];
+    for (let i = 0; i < authors.length; i++) {
+      oa.push({
+        "contributors.bio": "",
+        "contributors.name": authors[i],
+        "contributors.nameSort": authors[i],
+        "contributors.order": i + 1,
+        "contributors.role": "author",
+      })
+    }
+    const final = JSON.stringify(oa)
+
+    const flatReviews = JSON.stringify([
+      {
+        "reviews.review": "",
+        "reviews.reviewer": "",
+      }
+    ])
+
+    // add the properties missing in the schema from Supadu
+    doc.id = doc.identifier
+    doc.contributors = final;
+    doc.collection_code = 'oa-books'
+    doc.handle = doc.identifier
+    doc.publicationPlace = doc.coverage
+    doc.dateBook = doc.date
+    doc.descriptionHtml = doc.description_html
+    doc.pages = doc.format
+    doc.openSquareId = doc.identifier
+    doc.licenseAbbreviation = doc.license_abbreviation
+    doc.licenseIcon = doc.license_icon
+    doc.licenseLink = doc.license_link
+    doc.subjects = doc.subject
+    doc.titleSort = doc.title_sort
+    doc.pressUrl = doc.nyu_press_website_buy_the_book_url
+    doc.reviews = flatReviews
+    // required, but sometimes empty
+    doc.series = doc.series_names || "";
+
+    // remove the properties that are not in the schema
+    delete doc.author
+    delete doc.author_sort
+    delete doc.coverHref
+    delete doc.coverage
+    delete doc.date
+    delete doc.description_html
+    delete doc.format
+    delete doc.identifier
+    delete doc.isDownloadable
+    delete doc.license_abbreviation
+    delete doc.license_icon
+    delete doc.license_link
+    delete doc.nyu_press_website_buy_the_book_url
+    delete doc.packageUrl
+    delete doc.permanent_url
+    delete doc.rights
+    delete doc.rootUrl
+    delete doc.subject
+    delete doc.series_names
+    delete doc.thumbHref
+    delete doc.title_sort
 
     // Define the headers for the POST request
     const headers = new Headers()
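
For reference, a self-contained sketch of the contributor flattening the hunk above performs; the `flattenAuthors` helper name and the sample input are illustrative only and do not appear in the PR:

```typescript
// Sketch: turn a comma-separated author string into the flat
// "contributors.*" objects built in the loop above, serialized
// to a single JSON string for the Solr field.
function flattenAuthors(author: string): string {
  const authors = author.split(', ')
  const contributors = authors.map((name, i) => ({
    'contributors.bio': '',
    'contributors.name': name,
    'contributors.nameSort': name,
    'contributors.order': i + 1,
    'contributors.role': 'author',
  }))
  return JSON.stringify(contributors)
}

// Hypothetical input, shown only to illustrate the output shape
console.log(flattenAuthors('Jane Doe, John Smith'))
// [{"contributors.bio":"","contributors.name":"Jane Doe",...},
//  {"contributors.bio":"","contributors.name":"John Smith",...}]
```

Serializing the array into a single string field keeps the Solr document flat, at the cost of consumers having to JSON.parse the field on read.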
@@ -63,8 +120,4 @@
     console.log(err)
     console.log('-'.repeat(80))
   }
-
 }
-
-
-
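
The POST itself is collapsed in this view. Below is a minimal sketch of what sending one transformed document to the update endpoint defined at the top of the script can look like; the host, collection name, and document fields are placeholders, not values taken from the hidden code:

```typescript
// Sketch: send one transformed document to Solr's JSON update handler.
// solrHost and solrCollection are placeholders; adjust for your instance.
const solrHost = 'http://localhost:8983/solr'
const solrCollection = 'open-square-books'
const solrUrl = `${solrHost}/${solrCollection}/update/json?commit=true`

const headers = new Headers()
headers.set('Content-Type', 'application/json')

// Solr's JSON update endpoint accepts an array of documents to add
const response = await fetch(solrUrl, {
  method: 'POST',
  headers,
  body: JSON.stringify([{ id: '9780000000000', collection_code: 'oa-books' }]),
})
console.log(response.status, await response.text())
```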
2 changes: 1 addition & 1 deletion src/solr/open-square-metadata