From db55fb8eab3ed40944266ab152e9d11a91c0c19b Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Tue, 17 Jan 2023 15:36:03 +0100 Subject: [PATCH 01/11] Initial reconciliation setup and TODOs (#1606) A stripped-down version of the lobid-gnd implementation --- web/app/controllers/resources/Reconcile.java | 214 +++++++++++++++++++ web/conf/resources.routes | 4 + 2 files changed, 218 insertions(+) create mode 100644 web/app/controllers/resources/Reconcile.java diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java new file mode 100644 index 0000000000..da66eadb16 --- /dev/null +++ b/web/app/controllers/resources/Reconcile.java @@ -0,0 +1,214 @@ +/* Copyright 2017-2023 Fabian Steeg, hbz. Licensed under the EPL 2.0 */ + +package controllers.resources; + +import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.stream.Collectors; + +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.index.query.QueryStringQueryBuilder; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; + +import play.Logger; +import play.libs.Json; +import play.mvc.Controller; +import play.mvc.Result; +import play.mvc.Results; + +/** + * OpenRefine reconciliation service controller. + * + * Serves reconciliation service meta data and multi query requests. + * + * @author Fabian Steeg (fsteeg) + * + */ +public class Reconcile extends Controller { + + private static final JsonNode TYPES = + Json.toJson(Arrays.asList("BibliographicResource")); + + private static final DateTimeFormatter TIME_FORMATTER = DateTimeFormatter + .ofPattern("dd/MMM/yyyy:HH:mm:ss Z").withZone(ZoneId.systemDefault()); + + /** + * @param callback The name of the JSONP function to wrap the response in + * @param queries The queries. If this and extend are empty, return service + * metadata + * @param extend The extension data. If this and queries are empty, return + * service metadata + * @return OpenRefine reconciliation results (if queries is not empty), data + * extension information (if extend is not empty), or endpoint meta + * data (if queries and extend are empty), wrapped in `callback` + */ + public static Result main(String callback, String queries, String extend) { + ObjectNode result = queries.isEmpty() && extend.isEmpty() ? metadata() + : (!queries.isEmpty() ? queries(queries) : null); + response().setHeader("Access-Control-Allow-Origin", "*"); + final String resultString = prettyJsonString(result); + return (callback.isEmpty() ? ok(resultString) + : ok(String.format("/**/%s(%s);", callback, resultString))) + .as("application/json; charset=utf-8"); + } + + private static ObjectNode metadata() { + final String host = Application.CONFIG.getString("host"); + ObjectNode result = Json.newObject(); + result.putArray("versions").add("0.1").add("0.2"); + result.put("name", "lobid-resources reconciliation for OpenRefine"); + result.put("identifierSpace", "http://localhost:9000/resources"); + result.put("schemaSpace", "http://purl.org/dc/terms/BibliographicResource"); + result.set("defaultTypes", TYPES); + result.set("view", Json.newObject().put("url", host + "/resources/{{id}}")); + return result; + } + + private static String prettyJsonString(JsonNode jsonNode) { + try { + return new ObjectMapper().writerWithDefaultPrettyPrinter() + .writeValueAsString(jsonNode); + } catch (JsonProcessingException x) { + x.printStackTrace(); + return null; + } + } + + /** @return Reconciliation data for the queries in the request */ + public static Result reconcile() { + Map body = request().body().asFormUrlEncoded(); + response().setHeader("Access-Control-Allow-Origin", "*"); + Result result = body.containsKey("extend") ? Results.TODO + : ok(queries(body.get("queries")[0])); + // Apache-compatible POST logging, see + // https://github.com/hbz/lobid-gnd/issues/207#issuecomment-526571646 + Logger.info("{} {} - [{}] \"{} {}\" {}", + request().headers().getOrDefault("X-Forwarded-For", + new String[] { request().remoteAddress() }), + request().host(), TIME_FORMATTER.format(Instant.now()), + request().method(), request().path(), result.status()); + return result; + } + + private static ObjectNode queries(String src) { + JsonNode request = Json.parse(src); + Iterator> inputQueries = request.fields(); + ObjectNode response = Json.newObject(); + while (inputQueries.hasNext()) { + Entry inputQuery = inputQueries.next(); + Logger.info("q: " + inputQuery); + Search searchResponse = executeQuery(inputQuery, + preprocess(mainQuery(inputQuery)), propQuery(inputQuery)); + List results = + mapToResults(mainQuery(inputQuery), searchResponse); + ObjectNode resultsForInputQuery = Json.newObject(); + resultsForInputQuery.set("result", Json.toJson(results)); + Logger.info("r: " + resultsForInputQuery); + response.set(inputQuery.getKey(), resultsForInputQuery); + } + return response; + } + + private static List mapToResults(String mainQuery, + Search searchHits) { + List result = new ArrayList<>(); + searchHits.getResult().elements().forEachRemaining(hit -> { + Map map = new ObjectMapper().convertValue(hit, + new TypeReference>() {/**/ + }); + ObjectNode resultForHit = Json.newObject(); + resultForHit.set("id", hit.get("almaMmsId")); + Object nameObject = map.get("title"); + String name = nameObject == null ? "" : nameObject + ""; + resultForHit.put("name", name); + resultForHit.set("score", hit.get("score")); // TODO we have no score here + resultForHit.put("match", false); + resultForHit.set("type", hit.get("type")); + result.add(resultForHit); + }); + markMatch(result); + return result; + } + + private static void markMatch(List result) { + if (!result.isEmpty()) { + ObjectNode topResult = result.get(0); + int bestScore = topResult.get("score").asInt(); + if (bestScore > 50 && (result.size() == 1 + || bestScore - result.get(1).get("score").asInt() >= 5)) { + topResult.put("match", true); + } + } + } + + private static Search executeQuery(Entry entry, + String queryString, String propString) { + JsonNode limitNode = entry.getValue().get("limit"); + int limit = limitNode == null ? -1 : limitNode.asInt(); + JsonNode typeNode = entry.getValue().get("type"); + String filter = typeNode == null ? "" : "type:" + typeNode.asText(); + QueryStringQueryBuilder mainQuery = + QueryBuilders.queryStringQuery(queryString)// + .field("title", 4f)// + .field("otherTitleInformation", 2f)// + .field("responsibilityStatement")// + .field("rpbId")// + .field("hbzId")// + .field("almaMmsId")// + .field("sameAs.id")// + .field("id");// + QueryStringQueryBuilder propQuery = + QueryBuilders.queryStringQuery(propString).boost(5f); + + Search index = + new Search.Builder().query(mainQuery).from(0).size(limit).build(); + // TODO use filter (as filter) and propQuery (as 'should' query) + + return index.queryResources(); + } + + private static String propQuery(Entry entry) { + List segments = new ArrayList<>(); + JsonNode props = entry.getValue().get("properties"); + if (props != null) { + Logger.debug("Properties: {}", props); + for (JsonNode p : props) { + String field = p.get("pid").asText(); // TODO use field? + String value = p.get("v").asText().trim(); + if (!value.isEmpty()) { + segments.add("(" + value + ")"); + } + } + } + String queryString = segments.stream().collect(Collectors.joining(" OR ")); + Logger.debug("Property query string: {}", queryString); + return queryString; + } + + static String preprocess(String s) { + return s.startsWith("http") ? "\"" + s + "\"" + : /* index.validate(s) ? s : */ clean(s); // TODO add validation + } + + private static String clean(String in) { + String out = in.replaceAll("[:+\\-=<>(){}\\[\\]^]", " "); + Logger.info("Cleaned invalid query string '{}' to: '{}'", in, out); + return out; + } + + private static String mainQuery(Entry entry) { + return entry.getValue().get("query").asText(); + } +} diff --git a/web/conf/resources.routes b/web/conf/resources.routes index a29139b380..e95feb4784 100644 --- a/web/conf/resources.routes +++ b/web/conf/resources.routes @@ -12,6 +12,10 @@ GET /resources/advanced controllers.resources.Application.advanc GET /resources/search controllers.resources.Application.query(q?="", agent?="", name?="", subject?="", id?="", publisher?="", issued?="", medium ?= "", from:Int?=0, size:Int?=15, owner?="", t?="", sort ?= "", word?="", format ?= null, aggregations ?= "", location ?= "", nested ?= "", filter ?= "") GET /resources/facets controllers.resources.Application.facets(q,agent?="", name?="", subject?="", id?="", publisher?="", issued?="", medium ?= "", from:Int,size:Int,owner,t,field,sort,word?="", location ?= "", nested ?= "", filter ?= "") +#OpenRefine reconciliation endpoint +GET /resources/reconcile controllers.resources.Reconcile.main(callback ?= "", queries ?= "", extend ?= "") +POST /resources/reconcile controllers.resources.Reconcile.reconcile() + GET /resources/stars controllers.resources.Application.showStars(format?="", ids?="") GET /resources/stars/clear controllers.resources.Application.clearStars(ids ?= "") GET /resources/stars/all controllers.resources.Application.starAll(ids) From 1e20630c92df3f670b82d88f26192be1bea22357 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Wed, 18 Jan 2023 11:52:06 +0100 Subject: [PATCH 02/11] Improve reconciliation for RPB use case (#1606) --- web/app/controllers/resources/Reconcile.java | 38 +++++++++++++------- web/app/controllers/resources/Search.java | 8 +++-- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index da66eadb16..a7c9c9c853 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -13,6 +13,8 @@ import java.util.Map.Entry; import java.util.stream.Collectors; +import org.elasticsearch.index.query.BoolQueryBuilder; +import org.elasticsearch.index.query.Operator; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.QueryStringQueryBuilder; @@ -129,11 +131,14 @@ private static List mapToResults(String mainQuery, new TypeReference>() {/**/ }); ObjectNode resultForHit = Json.newObject(); - resultForHit.set("id", hit.get("almaMmsId")); + String[] elements = hit.get("id").asText().split("/"); + resultForHit.put("id", elements[elements.length - 1].replace("#!", "")); Object nameObject = map.get("title"); String name = nameObject == null ? "" : nameObject + ""; resultForHit.put("name", name); - resultForHit.set("score", hit.get("score")); // TODO we have no score here + // TODO: temp, need a proper score solution, with query, see + // https://github.com/hbz/lobid-resources/issues/635 + resultForHit.set("score", hit.get("_score")); resultForHit.put("match", false); resultForHit.set("type", hit.get("type")); result.add(resultForHit); @@ -169,14 +174,22 @@ private static Search executeQuery(Entry entry, .field("almaMmsId")// .field("sameAs.id")// .field("id");// - QueryStringQueryBuilder propQuery = - QueryBuilders.queryStringQuery(propString).boost(5f); - Search index = - new Search.Builder().query(mainQuery).from(0).size(limit).build(); - // TODO use filter (as filter) and propQuery (as 'should' query) + BoolQueryBuilder query = QueryBuilders.boolQuery().must(mainQuery) + // TODO: temp, don't reconcile against RPB records: + .mustNot(queryStringQuery("_exists_:rpbId")); + if (!filter.isEmpty()) { + query = query.filter(queryStringQuery(filter)); + } + if (propString != null && !propString.trim().isEmpty()) { + query = query.should(queryStringQuery(propString).boost(5f)); + } + return new Search.Builder().query(query).from(0).size(limit).build() + .queryResources(); + } - return index.queryResources(); + private static QueryStringQueryBuilder queryStringQuery(String q) { + return QueryBuilders.queryStringQuery(q).defaultOperator(Operator.AND); } private static String propQuery(Entry entry) { @@ -185,10 +198,11 @@ private static String propQuery(Entry entry) { if (props != null) { Logger.debug("Properties: {}", props); for (JsonNode p : props) { - String field = p.get("pid").asText(); // TODO use field? - String value = p.get("v").asText().trim(); + String field = p.get("pid").asText(); + String value = preprocess(p.get("v").asText().trim()); if (!value.isEmpty()) { - segments.add("(" + value + ")"); + segments + .add("(" + (field.equals("hbzId") ? "hbzId:" : "") + value + ")"); } } } @@ -203,7 +217,7 @@ static String preprocess(String s) { } private static String clean(String in) { - String out = in.replaceAll("[:+\\-=<>(){}\\[\\]^]", " "); + String out = in.replaceAll("[!/:+\\-=<>(){}\\[\\]^]", " "); Logger.info("Cleaned invalid query string '{}' to: '{}'", in, out); return out; } diff --git a/web/app/controllers/resources/Search.java b/web/app/controllers/resources/Search.java index 5a71751ffa..f2f6184fa9 100644 --- a/web/app/controllers/resources/Search.java +++ b/web/app/controllers/resources/Search.java @@ -1,4 +1,4 @@ -/* Copyright 2015-2019 Fabian Steeg, hbz. Licensed under the EPL 2.0 */ +/* Copyright 2015-2023 Fabian Steeg, hbz. Licensed under the EPL 2.0 */ package controllers.resources; @@ -163,7 +163,11 @@ public Search queryResources() { List results = new ArrayList<>(); this.aggregations = response.getAggregations(); for (SearchHit sh : hits.getHits()) { - results.add(Json.toJson(sh.getSource())); + Map source = sh.getSource(); + // TODO: temp, need a proper score solution, with query, see + // https://github.com/hbz/lobid-resources/issues/635 + source.put("_score", sh.getScore()); + results.add(Json.toJson(source)); } result = Json.toJson(results); total = hits.getTotalHits(); From 0e48259c91ab3c9d740258c050d3bb348f5faa62 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 19 Jan 2023 16:42:15 +0100 Subject: [PATCH 03/11] Tweak field boosting in reconciliation queries (#1606) For improved auto-matching when reconciling with IDs only --- web/app/controllers/resources/Reconcile.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index a7c9c9c853..5efaa6178f 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -169,11 +169,11 @@ private static Search executeQuery(Entry entry, .field("title", 4f)// .field("otherTitleInformation", 2f)// .field("responsibilityStatement")// - .field("rpbId")// - .field("hbzId")// - .field("almaMmsId")// - .field("sameAs.id")// - .field("id");// + .field("rpbId", 4f)// + .field("hbzId", 4f)// + .field("almaMmsId", 4f)// + .field("sameAs.id", 2f)// + .field("id", 4f);// BoolQueryBuilder query = QueryBuilders.boolQuery().must(mainQuery) // TODO: temp, don't reconcile against RPB records: From aa4396f8cac1089d3004906953a9da01a2cafd30 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Fri, 20 Jan 2023 10:26:07 +0100 Subject: [PATCH 04/11] Add preview feature to reconciliation service (#1606) --- .../controllers/resources/Application.java | 29 +++++++++++++++++++ web/app/controllers/resources/Reconcile.java | 7 ++++- web/conf/resources.routes | 1 + 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/web/app/controllers/resources/Application.java b/web/app/controllers/resources/Application.java index 66350e4978..6afbdf1148 100644 --- a/web/app/controllers/resources/Application.java +++ b/web/app/controllers/resources/Application.java @@ -72,6 +72,7 @@ import views.html.query; import views.html.rss; import views.html.stars; +import views.html.tags.result_doc; /** * The main application controller. @@ -456,6 +457,34 @@ public static Promise resourceDotFormat(final String id, return resource(id, format); } + /** + * @param id The resource ID. + * @return The preview page for the resource with the given ID. + */ + public static Promise preview(final String id) { + String cacheId = String.format("show(%s,%s)", id, "preview"); + @SuppressWarnings("unchecked") + Promise cachedResult = (Promise) Cache.get(cacheId); + if (cachedResult != null) + return cachedResult; + Promise promise = Promise.promise(() -> { + JsonNode result = + new Search.Builder().build().getResource(id).getResult(); + if (result == null) { + String movedTo = idSearchResult(id); + if (movedTo != null) { + return movedPermanently(routes.Application.preview(movedTo)); + } + } + return result != null + ? ok(result_doc + .render(play.api.libs.json.Json.parse(result.toString()))) + : notFound(details.render(CONFIG, "", id)); + }); + cacheOnRedeem(cacheId, promise, ONE_DAY); + return promise; + } + /** * @param id The resource ID. * @param format The response format (see {@code Accept.Format}) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index 5efaa6178f..e687712f6b 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -70,11 +70,16 @@ private static ObjectNode metadata() { final String host = Application.CONFIG.getString("host"); ObjectNode result = Json.newObject(); result.putArray("versions").add("0.1").add("0.2"); - result.put("name", "lobid-resources reconciliation for OpenRefine"); + result.put("name", + "lobid-resources reconciliation for OpenRefine (localhost)"); result.put("identifierSpace", "http://localhost:9000/resources"); result.put("schemaSpace", "http://purl.org/dc/terms/BibliographicResource"); result.set("defaultTypes", TYPES); result.set("view", Json.newObject().put("url", host + "/resources/{{id}}")); + result.set("preview", Json.newObject()// + .put("height", 300)// + .put("width", 600)// + .put("url", host + "/resources/{{id}}.preview")); return result; } diff --git a/web/conf/resources.routes b/web/conf/resources.routes index e95feb4784..e65ea6f902 100644 --- a/web/conf/resources.routes +++ b/web/conf/resources.routes @@ -25,6 +25,7 @@ DELETE /resources/stars/:id controllers.resources.Application.unstar GET /resources/context.jsonld controllers.resources.Application.context() GET /resources/dataset.jsonld controllers.resources.Application.dataset(format="json") GET /resources/dataset controllers.resources.Application.dataset(format?="") +GET /resources/:id.preview controllers.resources.Application.preview(id) GET /resources/:id.:format controllers.resources.Application.resourceDotFormat(id, format) GET /items/:id.$format controllers.resources.Application.itemDotFormat(id, format) GET /resources/:id controllers.resources.Application.resource(id, format ?= null) From 336c1b3ac20b0b272c248ff475882883880b122d Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Tue, 24 Jan 2023 14:30:22 +0100 Subject: [PATCH 05/11] Always add `pid` as field in query, tweak logging (#1606) --- web/app/controllers/resources/Reconcile.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index e687712f6b..18b34aa478 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -206,8 +206,7 @@ private static String propQuery(Entry entry) { String field = p.get("pid").asText(); String value = preprocess(p.get("v").asText().trim()); if (!value.isEmpty()) { - segments - .add("(" + (field.equals("hbzId") ? "hbzId:" : "") + value + ")"); + segments.add("(" + field + ":" + value + ")"); } } } @@ -223,7 +222,9 @@ static String preprocess(String s) { private static String clean(String in) { String out = in.replaceAll("[!/:+\\-=<>(){}\\[\\]^]", " "); - Logger.info("Cleaned invalid query string '{}' to: '{}'", in, out); + if (!in.equals(out)) { + Logger.info("Cleaned query string '{}' to: '{}'", in, out); + } return out; } From 7f48869c4b22fd99cf69e0d52b88e2bebeca4137 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Wed, 25 Jan 2023 10:53:54 +0100 Subject: [PATCH 06/11] Use host from config for name and identifierSpace (#1606) --- web/app/controllers/resources/Reconcile.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index 18b34aa478..8e2e0c52c9 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -71,8 +71,8 @@ private static ObjectNode metadata() { ObjectNode result = Json.newObject(); result.putArray("versions").add("0.1").add("0.2"); result.put("name", - "lobid-resources reconciliation for OpenRefine (localhost)"); - result.put("identifierSpace", "http://localhost:9000/resources"); + "lobid-resources reconciliation for OpenRefine (" + host + ")"); + result.put("identifierSpace", host + "/resources"); result.put("schemaSpace", "http://purl.org/dc/terms/BibliographicResource"); result.set("defaultTypes", TYPES); result.set("view", Json.newObject().put("url", host + "/resources/{{id}}")); From 40cbd207ce68b286340cdcaa6dd2201aac1acee5 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Wed, 25 Jan 2023 12:47:13 +0100 Subject: [PATCH 07/11] Reuse search logic without adding _score to all responses (#1606) --- web/app/controllers/resources/Reconcile.java | 9 ++++++++- web/app/controllers/resources/Search.java | 10 +++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index 8e2e0c52c9..582c00c169 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -17,6 +17,7 @@ import org.elasticsearch.index.query.Operator; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.QueryStringQueryBuilder; +import org.elasticsearch.search.SearchHit; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; @@ -190,7 +191,13 @@ private static Search executeQuery(Entry entry, query = query.should(queryStringQuery(propString).boost(5f)); } return new Search.Builder().query(query).from(0).size(limit).build() - .queryResources(); + .queryResources((SearchHit hit) -> { + Map source = hit.getSource(); + // TODO: temp, need a proper score solution, with query, see + // https://github.com/hbz/lobid-resources/issues/635 + source.put("_score", hit.getScore()); + return Json.toJson(source); + }); } private static QueryStringQueryBuilder queryStringQuery(String q) { diff --git a/web/app/controllers/resources/Search.java b/web/app/controllers/resources/Search.java index f2f6184fa9..1ee1fa4d10 100644 --- a/web/app/controllers/resources/Search.java +++ b/web/app/controllers/resources/Search.java @@ -144,6 +144,10 @@ public long totalHits() { * {@link #getTotal()} */ public Search queryResources() { + return queryResources((SearchHit hit) -> Json.toJson(hit.getSource())); + } + + Search queryResources(Function transformer) { Search resultIndex = withClient((Client client) -> { validate(client, query); Logger.trace("queryResources: q={}, from={}, size={}, sort={}, query={}", @@ -163,11 +167,7 @@ public Search queryResources() { List results = new ArrayList<>(); this.aggregations = response.getAggregations(); for (SearchHit sh : hits.getHits()) { - Map source = sh.getSource(); - // TODO: temp, need a proper score solution, with query, see - // https://github.com/hbz/lobid-resources/issues/635 - source.put("_score", sh.getScore()); - results.add(Json.toJson(source)); + results.add(transformer.apply(sh)); } result = Json.toJson(results); total = hits.getTotalHits(); From 60f55f75ca6a6e6e13fa620dfa183a279313b60d Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 26 Jan 2023 15:18:15 +0100 Subject: [PATCH 08/11] Remove quotes when cleaning queries, leave GND IDs alone (#1606) --- web/app/controllers/resources/Reconcile.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index 582c00c169..dc709fed7b 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -223,12 +223,18 @@ private static String propQuery(Entry entry) { } static String preprocess(String s) { - return s.startsWith("http") ? "\"" + s + "\"" + return s.startsWith("http") || isGndId(s) ? "\"" + s + "\"" : /* index.validate(s) ? s : */ clean(s); // TODO add validation } + private static boolean isGndId(String string) { + return string.matches( + // https://www.wikidata.org/wiki/Property:P227#P1793 + "1[012]?\\d{7}[0-9X]|[47]\\d{6}-\\d|[1-9]\\d{0,7}-[0-9X]|3\\d{7}[0-9X]"); + } + private static String clean(String in) { - String out = in.replaceAll("[!/:+\\-=<>(){}\\[\\]^]", " "); + String out = in.replaceAll("[\"!/:+\\-=<>(){}\\[\\]^]", " "); if (!in.equals(out)) { Logger.info("Cleaned query string '{}' to: '{}'", in, out); } From 8ce61e5bbebbf12f17df98e63edd3ec7b0bf0f52 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Tue, 14 Feb 2023 15:21:28 +0100 Subject: [PATCH 09/11] Increase matching threshold and boosts, don't filter types (#1606) Use `should` query for types instead, boosted like the title field Based on usage feedback, see https://jira.hbz-nrw.de/browse/RPB-50 --- web/app/controllers/resources/Reconcile.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index dc709fed7b..16c6add624 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -158,7 +158,7 @@ private static void markMatch(List result) { ObjectNode topResult = result.get(0); int bestScore = topResult.get("score").asInt(); if (bestScore > 50 && (result.size() == 1 - || bestScore - result.get(1).get("score").asInt() >= 5)) { + || bestScore - result.get(1).get("score").asInt() > 10)) { topResult.put("match", true); } } @@ -172,20 +172,20 @@ private static Search executeQuery(Entry entry, String filter = typeNode == null ? "" : "type:" + typeNode.asText(); QueryStringQueryBuilder mainQuery = QueryBuilders.queryStringQuery(queryString)// - .field("title", 4f)// + .field("title", 8f)// .field("otherTitleInformation", 2f)// .field("responsibilityStatement")// - .field("rpbId", 4f)// - .field("hbzId", 4f)// - .field("almaMmsId", 4f)// + .field("rpbId", 8f)// + .field("hbzId", 8f)// + .field("almaMmsId", 8f)// .field("sameAs.id", 2f)// - .field("id", 4f);// + .field("id", 8f);// BoolQueryBuilder query = QueryBuilders.boolQuery().must(mainQuery) // TODO: temp, don't reconcile against RPB records: .mustNot(queryStringQuery("_exists_:rpbId")); if (!filter.isEmpty()) { - query = query.filter(queryStringQuery(filter)); + query = query.should(queryStringQuery(filter).boost(8f)); } if (propString != null && !propString.trim().isEmpty()) { query = query.should(queryStringQuery(propString).boost(5f)); From 2dac323e7b2ede0b1b733cd1b1762715328c2d8b Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 16 Feb 2023 17:10:33 +0100 Subject: [PATCH 10/11] Extract boosts, increase for title & otherTitleInformation (#1606) --- web/app/controllers/resources/Reconcile.java | 21 +++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index 16c6add624..429252583c 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -170,25 +170,28 @@ private static Search executeQuery(Entry entry, int limit = limitNode == null ? -1 : limitNode.asInt(); JsonNode typeNode = entry.getValue().get("type"); String filter = typeNode == null ? "" : "type:" + typeNode.asText(); + float topBoost = 12f; + float mediumBoost = 6f; + float minBoost = 2f; QueryStringQueryBuilder mainQuery = QueryBuilders.queryStringQuery(queryString)// - .field("title", 8f)// - .field("otherTitleInformation", 2f)// + .field("title", topBoost)// + .field("otherTitleInformation", mediumBoost)// .field("responsibilityStatement")// - .field("rpbId", 8f)// - .field("hbzId", 8f)// - .field("almaMmsId", 8f)// - .field("sameAs.id", 2f)// - .field("id", 8f);// + .field("rpbId", topBoost)// + .field("hbzId", topBoost)// + .field("almaMmsId", topBoost)// + .field("sameAs.id", minBoost)// + .field("id", topBoost);// BoolQueryBuilder query = QueryBuilders.boolQuery().must(mainQuery) // TODO: temp, don't reconcile against RPB records: .mustNot(queryStringQuery("_exists_:rpbId")); if (!filter.isEmpty()) { - query = query.should(queryStringQuery(filter).boost(8f)); + query = query.should(queryStringQuery(filter).boost(topBoost)); } if (propString != null && !propString.trim().isEmpty()) { - query = query.should(queryStringQuery(propString).boost(5f)); + query = query.should(queryStringQuery(propString).boost(mediumBoost)); } return new Search.Builder().query(query).from(0).size(limit).build() .queryResources((SearchHit hit) -> { From f4406806c5aee0655896b0e17e62f3e05e5bdf55 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Wed, 22 Feb 2023 17:36:30 +0100 Subject: [PATCH 11/11] Revert previous boost changes, add alternativeTitle field (#1606) --- web/app/controllers/resources/Reconcile.java | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/web/app/controllers/resources/Reconcile.java b/web/app/controllers/resources/Reconcile.java index 429252583c..7dd3077458 100644 --- a/web/app/controllers/resources/Reconcile.java +++ b/web/app/controllers/resources/Reconcile.java @@ -170,28 +170,26 @@ private static Search executeQuery(Entry entry, int limit = limitNode == null ? -1 : limitNode.asInt(); JsonNode typeNode = entry.getValue().get("type"); String filter = typeNode == null ? "" : "type:" + typeNode.asText(); - float topBoost = 12f; - float mediumBoost = 6f; - float minBoost = 2f; QueryStringQueryBuilder mainQuery = QueryBuilders.queryStringQuery(queryString)// - .field("title", topBoost)// - .field("otherTitleInformation", mediumBoost)// + .field("title", 8f)// + .field("alternativeTitle", 4f)// + .field("otherTitleInformation", 2f)// .field("responsibilityStatement")// - .field("rpbId", topBoost)// - .field("hbzId", topBoost)// - .field("almaMmsId", topBoost)// - .field("sameAs.id", minBoost)// - .field("id", topBoost);// + .field("rpbId", 8f)// + .field("hbzId", 8f)// + .field("almaMmsId", 8f)// + .field("sameAs.id", 2f)// + .field("id", 8f);// BoolQueryBuilder query = QueryBuilders.boolQuery().must(mainQuery) // TODO: temp, don't reconcile against RPB records: .mustNot(queryStringQuery("_exists_:rpbId")); if (!filter.isEmpty()) { - query = query.should(queryStringQuery(filter).boost(topBoost)); + query = query.should(queryStringQuery(filter).boost(8f)); } if (propString != null && !propString.trim().isEmpty()) { - query = query.should(queryStringQuery(propString).boost(mediumBoost)); + query = query.should(queryStringQuery(propString).boost(5f)); } return new Search.Builder().query(query).from(0).size(limit).build() .queryResources((SearchHit hit) -> {