Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: push kb #831

Merged
merged 12 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 69 additions & 40 deletions service/grails-app/services/org/olf/PackageIngestService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import static org.springframework.transaction.annotation.Propagation.REQUIRES_NE

import java.util.concurrent.TimeUnit

import org.olf.general.Org

import org.olf.general.StringUtils
import org.olf.general.IngestException

Expand Down Expand Up @@ -135,14 +137,11 @@ class PackageIngestService implements DataBinder {
// ENSURE MDC title is set as early as possible
MDC.put('title', StringUtils.truncate(pc.title.toString()))

// log.debug("Try to resolve ${pc}")

try {
PackageContentItem.withNewSession { tsess ->
PackageContentItem.withNewTransaction { status ->
// Delegate out to TitleIngestService so that any shared steps can move there.
Map titleIngestResult = titleIngestService.upsertTitle(pc, kb, trustedSourceTI)

// titleIngestResult.titleInstanceId will be non-null IFF TitleIngestService managed to find a title with that Id.
if ( titleIngestResult.titleInstanceId != null ) {
// Pass off to new hierarchy method (?)
Expand Down Expand Up @@ -243,14 +242,13 @@ class PackageIngestService implements DataBinder {
}
}

/*
* Separate out the create or lookup pkg code, so that it can
* be used both by the ingest service (via upsert pkg), as well
* as the pushKBService (directly)
*
* This method ALSO provides update information for packages.
/*
* We split out the lookup vs the update vs the create code for packages,
* as there is potentially differences in behaviour between pushKB and harvest.
*/
public Pkg lookupOrCreatePkg(PackageSchema package_data) {

// This WILL NOT update or create a package from the data.
public Pkg lookupPkg(PackageSchema package_data) {
Pkg pkg = null

// header.packageSlug contains the package maintainers authoritative identifier for this package.
Expand All @@ -266,8 +264,16 @@ class PackageIngestService implements DataBinder {
}
}

def vendor = null
if ( ( package_data.header?.packageProvider?.name != null ) && ( package_data.header?.packageProvider?.name.trim().length() > 0 ) ) {
return pkg;
}

public Org getVendorFromPackageData(PackageSchema package_data) {
Org vendor = null;

if (
package_data.header?.packageProvider?.name != null &&
package_data.header?.packageProvider?.name.trim().length() > 0
) {
log.debug("Package contains provider information: ${package_data.header?.packageProvider?.name} -- trying to match to an existing organisation.")
vendor = dependentModuleProxyService.coordinateOrg(package_data.header?.packageProvider?.name)
// reference has been removed at the request of the UI team
Expand All @@ -277,18 +283,66 @@ class PackageIngestService implements DataBinder {
log.warn('Package ingest - no provider information present')
}

return vendor;
}

public Pkg lookupPkgAndUpdate(PackageSchema package_data) {
Pkg pkg = lookupPkg(package_data)
Org vendor = getVendorFromPackageData(package_data)
// Do update step but NOT create step
if (pkg != null) {
pkg.sourceDataUpdated = package_data.header.sourceDataUpdated

if (package_data.header.lifecycleStatus) {
pkg.lifecycleStatusFromString = package_data.header.lifecycleStatus
}

if (package_data.header.availabilityScope) {
pkg.availabilityScopeFromString = package_data.header.availabilityScope
}

pkg.vendor = vendor
pkg.description = package_data.header.description
pkg.name = package_data.header.packageName
pkg.save(failOnError:true)

// Call separate methods for updating collections for code cleanliness
// These methods are responsible for their own saves
updateContentTypes(pkg.id, package_data)
updateAlternateNames(pkg.id, package_data)
updateAvailabilityConstraints(pkg.id, package_data)
updatePackageDescriptionUrls(pkg.id, package_data)
}

return pkg;
}


/*
* Separate out the create or lookup pkg code, so that it can
* be used both by the ingest service (via upsert pkg), as well
* as the pushKBService (directly)
*
* This method ALSO updates information for packages.
*/
public Pkg lookupOrCreatePkg(PackageSchema package_data) {
// This takes care of any updates
Pkg pkg = lookupPkgAndUpdate(package_data);

// If pkg is null, then we can safely create a new one
if ( pkg == null ) {
Org vendor = getVendorFromPackageData(package_data)
pkg = new Pkg(
name: package_data.header.packageName,
source: package_data.header.packageSource,
name: package_data.header.packageName,
source: package_data.header.packageSource,
reference: package_data.header.packageSlug,
description: package_data.header.description,
sourceDataCreated: package_data.header.sourceDataCreated,
sourceDataUpdated: package_data.header.sourceDataUpdated,
sourceTitleCount: package_data.header.sourceTitleCount,
availabilityScope: ( package_data.header.availabilityScope != null ? Pkg.lookupOrCreateAvailabilityScope(package_data.header.availabilityScope) : null ),
lifecycleStatus: Pkg.lookupOrCreateLifecycleStatus(package_data.header.lifecycleStatus != null ? package_data.header.lifecycleStatus : 'Unknown'),
vendor: vendor,
vendor: vendor,
).save(flush:true, failOnError:true)

(package_data?.header?.contentTypes ?: []).each {
Expand All @@ -308,28 +362,6 @@ class PackageIngestService implements DataBinder {
}

pkg.save(failOnError: true)
} else {
pkg.sourceDataUpdated = package_data.header.sourceDataUpdated

if (package_data.header.lifecycleStatus) {
pkg.lifecycleStatusFromString = package_data.header.lifecycleStatus
}

if (package_data.header.availabilityScope) {
pkg.availabilityScopeFromString = package_data.header.availabilityScope
}

pkg.vendor = vendor
pkg.description = package_data.header.description
pkg.name = package_data.header.packageName
pkg.save(failOnError:true)

// Call separate methods for updating collections for code cleanliness
// These methods are responsible for their own saves
updateContentTypes(pkg.id, package_data)
updateAlternateNames(pkg.id, package_data)
updateAvailabilityConstraints(pkg.id, package_data)
updatePackageDescriptionUrls(pkg.id, package_data)
}

return pkg;
Expand Down Expand Up @@ -475,7 +507,6 @@ class PackageIngestService implements DataBinder {
}

Platform platform = Platform.resolve(platform_url_to_use, pc.platformName)
// log.debug("Platform: ${platform}")

if ( platform == null && PROXY_MISSING_PLATFORM ) {
platform = Platform.resolve('http://localhost.localdomain', 'This platform entry is used for error cases')
Expand Down Expand Up @@ -528,8 +559,6 @@ class PackageIngestService implements DataBinder {
pciStatus: 'none' // This should be 'none', 'updated' or 'new'
]

// log.debug("platform ${pc.platformUrl} ${pc.platformName} (item URL is ${pc.url})")

// lets try and work out the platform for the item
Platform platform = lookupOrCreatePlatform(pc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class TitleIngestService implements DataBinder {
TitleInstance.withNewTransaction {
if (!kb) {
// This KB is created without a Type... so if it was trusted for TI data then it'd fail secondary enrichment
kb = new RemoteKB( name:remotekbname,
kb = new RemoteKB( name:remotekbname,
rectype: RemoteKB.RECTYPE_TITLE,
active: Boolean.TRUE,
readonly:readOnly,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,6 @@ order by pj.dateCreated
org.slf4j.MDC.clear()
org.slf4j.MDC.setContextMap( jobId: '' + jid, tenantId: '' + tid, 'tenant': '' + tenantName)
JobContext.current.set(new JobContext( jobId: jid, tenantId: tid ))
//log.debug("LOGDEBUG TID: ${tId}")
Tenants.withId(tid) {
beginJob(jid)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ class PushKBService implements DataBinder {
updatedAccessEnd: 0,
]
KBIngressType ingressType = kbManagementBean.ingressType

if (ingressType == KBIngressType.PushKB) {
try {
pcis.each { Map record ->
Expand All @@ -133,47 +132,64 @@ class PushKBService implements DataBinder {
if (utilityService.checkValidBinding(pc)) {
try {
Pkg pkg = null;
Pkg.withNewTransaction { status ->
// TODO this will allow the PCI data to update the PKG record... do we want this?
pkg = packageIngestService.lookupOrCreatePackageFromTitle(pc);
log.debug("LOGGING PACKAGE OBTAINED FROM PCI: ${pkg}")

Map titleIngestResult = titleIngestService.upsertTitleDirect(pc)
log.debug("LOGGING titleIngestResult: ${titleIngestResult}")

if ( titleIngestResult.titleInstanceId != null ) {
Map hierarchyResult = packageIngestService.lookupOrCreateTitleHierarchy(
titleIngestResult.titleInstanceId,
pkg.id,
true,
pc,
result.updateTime,
result.titleCount // FIXME not sure about this
)

PackageContentItem pci = PackageContentItem.get(hierarchyResult.pciId)
packageIngestService.hierarchyResultMapLogic(hierarchyResult, result, pci)

/* TODO figure out if use of removedTimestamp
* should be something harvest also needs to do directly
* And whether we should be doing it after all the above
* or before.
*/
if (pc.removedTimestamp) {
try {
log.debug("Removal candidate: pci.id #${pci.id} (Last seen ${pci.lastSeenTimestamp}, thisUpdate ${result.updateTime}) -- Set removed")
pci.removedTimestamp = pc.removedTimestamp
pci.save(failOnError:true)
} catch ( Exception e ) {
log.error("Problem removing ${pci} in package load", e)
Pkg.withSession { currentSess ->
Pkg.withTransaction {
Pkg.withNewSession { newSess ->
Pkg.withTransaction {
// TODO this will allow the PCI data to update the PKG record... do we want this?

pkg = packageIngestService.lookupOrCreatePackageFromTitle(pc);
}
newSess.clear()
}
}
}

TitleInstance.withSession { currentSess ->
TitleInstance.withTransaction {
TitleInstance.withNewSession { newSess ->
TitleInstance.withTransaction {
Map titleIngestResult = titleIngestService.upsertTitleDirect(pc)

if ( titleIngestResult.titleInstanceId != null ) {

Map hierarchyResult = packageIngestService.lookupOrCreateTitleHierarchy(
titleIngestResult.titleInstanceId,
pkg.id,
true,
pc,
result.updateTime,
result.titleCount // Not totally sure this is valuable here
)

PackageContentItem pci = PackageContentItem.get(hierarchyResult.pciId)
packageIngestService.hierarchyResultMapLogic(hierarchyResult, result, pci)

/* TODO figure out if use of removedTimestamp
* should be something harvest also needs to do directly
* And whether we should be doing it after all the above
* or before.
*/
if (pc.removedTimestamp) {
try {
log.debug("Removal candidate: pci.id #${pci.id} (Last seen ${pci.lastSeenTimestamp}, thisUpdate ${result.updateTime}) -- Set removed")
pci.removedTimestamp = pc.removedTimestamp
pci.save(failOnError:true)
} catch ( Exception e ) {
log.error("Problem removing ${pci} in package load", e)
}
result.removedTitles++
}
} else {
String message = "Skipping \"${pc.title}\". Unable to resolve title from ${pc.title} with identifiers ${pc.instanceIdentifiers}"
log.error(message)
}
}
result.removedTitles++
newSess.clear()
}
} else {
String message = "Skipping \"${pc.title}\". Unable to resolve title from ${pc.title} with identifiers ${pc.instanceIdentifiers}"
log.error(message)
}
}

} catch ( IngestException ie ) {
// When we've caught an ingest exception, should have helpful error log message
String message = "Skipping \"${pc.title}\": ${ie.message}"
Expand All @@ -196,7 +212,7 @@ class PushKBService implements DataBinder {
} */
}

long finishedTime = (System.currentTimeMillis()-result.startTime)/1000
long finishedTime = (System.currentTimeMillis()-result.startTime) // Don't divide by 1000 here
result.success = true

// TODO Logging may need tweaking between pushKB and harvest
Expand Down
40 changes: 40 additions & 0 deletions service/src/main/okapi/tenant/sample_data/test1-data.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// DEV NOTE -- This is what will be used for the default rancher-desktop-db/dc environments
// For vagrant development (which I think is deprecated now) there may need to be a different file
// to avoid making changes to diku-data or _data.
log.info "Running test1-data tenant sample data file"

import org.olf.kb.RemoteKB

/* RemoteKB.findByName('GOKb_TEST') ?: (new RemoteKB(
name:'GOKb_TEST',
type:'org.olf.kb.adapters.GOKbOAIAdapter',
uri:'https://gokbt.gbv.de/gokb/oai/index',
fullPrefix:'gokb',
rectype: RemoteKB.RECTYPE_PACKAGE,
active:Boolean.TRUE,
supportsHarvesting:true,
activationEnabled:false,
//cursor: "2022-08-09T19:34:42Z"
).save(failOnError:true)) */

RemoteKB.findByName('GOKb') ?: (new RemoteKB(
name:'GOKb',
type:'org.olf.kb.adapters.GOKbOAIAdapter',
uri:'https://gokb.org/gokb/oai/index',
fullPrefix:'gokb',
rectype: RemoteKB.RECTYPE_PACKAGE,
active:Boolean.TRUE,
supportsHarvesting:true,
activationEnabled:false
).save(failOnError:true))

/* RemoteKB.findByName('DEBUG') ?: (new RemoteKB(
name:'DEBUG',
type:'org.olf.kb.adapters.DebugGoKbAdapter',
// uri can be used to directly force a package from the resources folder
// uri: 'src/integration-test/resources/DebugGoKbAdapter/borked_ids.xml'
rectype: RemoteKB.RECTYPE_PACKAGE,
active:Boolean.TRUE,
supportsHarvesting:true,
activationEnabled:false
).save(failOnError:true)) */