Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment: default IDs in LUT references #81

Merged
merged 2 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ package eu.ostrzyciel.jelly.core

import java.util

private[core] object EncoderValue:
  /** Shared all-zero, non-new value; reusing a single instance slightly reduces heap pressure. */
  val Empty: EncoderValue = EncoderValue(getId = 0, setId = 0, newEntry = false)

/**
 * Result of adding an entry to an encoder lookup table.
 *
 * @param getId ID used to refer to the entry (the encoder puts it into RdfIri's prefixId / nameId)
 * @param setId ID written into the RdfNameEntry / RdfPrefixEntry row that defines the entry
 * @param newEntry true when the entry was just added, in which case the encoder emits a defining row
 */
private[core] final case class EncoderValue(getId: Int, setId: Int, newEntry: Boolean)

private[core] final class EncoderLookup(maxEntries: Int)
Expand Down
15 changes: 13 additions & 2 deletions core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ private[core] final class NameDecoder(opt: RdfStreamOptions):
// Prefix lookup table, sized by opt.maxPrefixTableSize
private val prefixLookup = new DecoderLookup[String](opt.maxPrefixTableSize)
// Name lookup table, sized by opt.maxNameTableSize
private val nameLookup = new DecoderLookup[String](opt.maxNameTableSize)

// ID of the last prefix that decode() resolved from an explicit (non-zero) prefixId; 0 = none yet
private var lastIriPrefixId: Int = 0
// ID of the last name that decode() resolved; a nameId of 0 is read as "this value + 1"
private var lastIriNameId: Int = 0

/**
* Update the name table.
* @param nameRow name row
Expand All @@ -32,16 +35,24 @@ private[core] final class NameDecoder(opt: RdfStreamOptions):
*/
def decode(iri: RdfIri): String =
  // A prefixId of 0 is the default marker: reuse the prefix of the previously decoded IRI.
  // The explicit-ID branch is the only place that updates lastIriPrefixId.
  val prefix = iri.prefixId match
    // No explicit prefix has been seen yet, so the IRI has no prefix part
    case 0 if lastIriPrefixId < 1 => ""
    // the .get() result can't be null here, we've already retrieved it before
    case 0 => prefixLookup.get(lastIriPrefixId)
    case id =>
      val p = prefixLookup.get(id)
      if p == null then throw MissingPrefixEntryError(id)
      lastIriPrefixId = id
      p
  // A nameId of 0 is the default marker: it stands for (previous name ID + 1).
  val name = iri.nameId match
    case 0 =>
      lastIriNameId += 1
      val n = nameLookup.get(lastIriNameId)
      // The error reports the resolved ID, not the 0 that was on the wire
      if n == null then throw MissingNameEntryError(lastIriNameId)
      n
    case id =>
      val n = nameLookup.get(id)
      if n == null then throw MissingNameEntryError(id)
      lastIriNameId = id
      n

  prefix + name
93 changes: 62 additions & 31 deletions core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,56 @@ import eu.ostrzyciel.jelly.core.proto.v1.*

import scala.collection.mutable.ListBuffer

private[core] object NameEncoder:
// Datatype literal kind with ID 0, built once and shared by all encoder instances.
// NOTE(review): the name suggests that ID 0 means "repeat the previous datatype" — confirm at the use site.
private val repeatDatatype = RdfLiteral.LiteralKind.Datatype(0)

/**
* IRI and datatype encoder.
* Maintains internal lookups for prefixes, names, and datatypes. Uses the LRU strategy for eviction.
*
* @param opt Jelly options
*/
private[core] final class NameEncoder(opt: RdfStreamOptions):
import NameEncoder.*

// Encoder-side lookups for names, prefixes and datatypes, each sized from the stream options
private val nameLookup = new EncoderLookup(opt.maxNameTableSize)
private val prefixLookup = new EncoderLookup(opt.maxPrefixTableSize)
private val dtLookup = new EncoderLookup(opt.maxDatatypeTableSize)
// NOTE(review): presumably holds the Datatype message for each datatype ID — confirm in the datatype encoder
private val dtTable = new DecoderLookup[RdfLiteral.LiteralKind.Datatype](opt.maxDatatypeTableSize)

// getId of the prefix used by the previously encoded IRI; encodeIri compares against it to decide
// whether prefixId can be left at 0. NOTE(review): -1000 looks like a sentinel that never equals
// a real prefix ID, so the first IRI always carries an explicit prefixId — confirm.
private var lastIriPrefixId: Int = -1000
// getId of the name used by the previously encoded IRI; read and updated by getNameIdWithRepeat
private var lastIriNameId: Int = 0

/**
 * Try to extract the prefix out of the IRI.
 *
 * The prefix is everything up to and including the last '#', or, when the IRI contains no '#',
 * everything up to and including the last '/'.
 *
 * Somewhat based on [[org.apache.jena.riot.system.PrefixMapStd.getPossibleKey]]
 * @param iri IRI
 * @return prefix which can be empty, never null
 */
private def getIriPrefix(iri: String): String =
  iri.lastIndexOf('#') match
    case i if i > -1 => iri.substring(0, i + 1)
    case _ =>
      iri.lastIndexOf('/') match
        case i if i > -1 => iri.substring(0, i + 1)
        // Neither separator is present: return "" rather than null, because callers use prefix.length
        case _ => ""

/**
* Obtain the id for the name lookup table to be communicated to the consumer.
* This method checks if new id = last_id + 1, and if so, it returns 0.
*
* @param getId the getId from the EncoderLookup
* @return the id to be communicated to the consumer
*/
private inline def getNameIdWithRepeat(getId: Int): Int =
  // Run the "direct successor" check against the previous ID before overwriting it
  val followsPrevious = getId == lastIriNameId + 1
  lastIriNameId = getId
  // 0 is the shorthand for "previous name ID + 1"; any other ID is sent explicitly
  if followsPrevious then 0 else getId

/**
* Encodes an IRI to a protobuf representation.
Expand All @@ -39,40 +63,47 @@ private[core] final class NameEncoder(opt: RdfStreamOptions):
* @return protobuf representation of the IRI
*/
def encodeIri(iri: String, rowsBuffer: ListBuffer[RdfStreamRow]): RdfIri =
  // Two shorthands are applied to the returned RdfIri:
  //  - nameId = 0 when the name ID is exactly (previous name ID + 1), see getNameIdWithRepeat
  //  - prefixId = 0 when the prefix is the same as for the previously encoded IRI
  // New lookup entries are appended to rowsBuffer before the RdfIri is returned.
  if opt.maxPrefixTableSize == 0 then
    // Use a lighter algorithm if the prefix table is disabled
    val nameLookupEntry = nameLookup.addEntry(iri)
    if nameLookupEntry.newEntry then
      rowsBuffer.append(
        RdfStreamRow(RdfStreamRow.Row.Name(
          RdfNameEntry(id = nameLookupEntry.setId, value = iri)
        ))
      )
    // We set the prefixId to 0, but it's a special case, because the prefix table is disabled.
    // The consumer will interpret this as no prefix.
    RdfIri(nameId = getNameIdWithRepeat(nameLookupEntry.getId))
  else
    // Defensive: the prefix is documented as never null, but a missing prefix must still be
    // treated as the empty prefix rather than failing on .length below.
    val extractedPrefix = getIriPrefix(iri)
    val prefix = if extractedPrefix == null then "" else extractedPrefix
    val postfix = iri.substring(prefix.length)
    // Both parts are always registered, even when empty, so that both IDs are defined
    val prefixLookupEntry = prefixLookup.addEntry(prefix)
    val nameLookupEntry = nameLookup.addEntry(postfix)

    if prefixLookupEntry.newEntry then rowsBuffer.append(
      RdfStreamRow(RdfStreamRow.Row.Prefix(
        RdfPrefixEntry(prefixLookupEntry.setId, prefix)
      ))
    )
    if nameLookupEntry.newEntry then rowsBuffer.append(
      RdfStreamRow(RdfStreamRow.Row.Name(
        RdfNameEntry(nameLookupEntry.setId, postfix)
      ))
    )

    val nameIdWithRepeat = getNameIdWithRepeat(nameLookupEntry.getId)
    if lastIriPrefixId == prefixLookupEntry.getId then
      // If the last IRI had the same prefix, we can tell the consumer to reuse it.
      // prefixId = 0 by default in this constructor.
      // No need to update lastIriPrefixId, because it's the same.
      RdfIri(nameId = nameIdWithRepeat)
    else
      lastIriPrefixId = prefixLookupEntry.getId
      RdfIri(
        prefixId = prefixLookupEntry.getId,
        nameId = nameIdWithRepeat
      )

/**
* Encodes a datatype IRI to a protobuf representation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,27 @@ class NameDecoderSpec extends AnyWordSpec, Matchers:
error.nameId should be (5)
}

"throw MissingNameEntryError when trying to retrieve a name with empty LUT" in {
  val dec = NameDecoder(smallOptions)
  // nameId = 0 means "previous name ID + 1"; on a fresh decoder that is ID 1, which is not set
  val error = intercept[MissingNameEntryError] {
    dec.decode(RdfIri(0, 0))
  }
  // The error must report the resolved ID (1), not the 0 that was passed in
  error.getMessage should include ("name table at ID: 1")
  error.nameId should be (1)
}

"return empty string for no prefix and empty name" in {
val dec = NameDecoder(smallOptions)
// Register an empty name with the default ID 0 — presumably it takes ID 1 on a fresh decoder; confirm in updateNames
dec.updateNames(RdfNameEntry(0, ""))
// prefixId = 0 with no earlier prefix gives ""; nameId = 0 resolves to (previous name ID + 1)
dec.decode(RdfIri(0, 0)) should be ("")
}

"accept new prefixes with default IDs" in {
val dec = NameDecoder(smallOptions)
// Prefix entries sent with ID 0 are expected to take IDs 1 and 2, as the decode assertions below check
dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/"))
dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/"))
// Two empty names, so that each decode below (nameId = 0, i.e. "next name ID") finds an entry
dec.updateNames(RdfNameEntry(0, ""))
dec.updateNames(RdfNameEntry(0, ""))
dec.decode(RdfIri(1, 0)) should be("https://test.org/")
dec.decode(RdfIri(2, 0)) should be("https://test.org/2/")
}
Expand All @@ -45,13 +57,16 @@ class NameDecoderSpec extends AnyWordSpec, Matchers:
dec.updatePrefixes(RdfPrefixEntry(4, "https://test.org/"))
// This ID will resolve to 5
dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/"))
dec.updateNames(RdfNameEntry(0, ""))
dec.updateNames(RdfNameEntry(0, ""))
dec.decode(RdfIri(4, 0)) should be("https://test.org/")
dec.decode(RdfIri(5, 0)) should be("https://test.org/2/")
}

"accept a new prefix and return it (IRI with no name part)" in {
val dec = NameDecoder(smallOptions)
dec.updatePrefixes(RdfPrefixEntry(3, "https://test.org/"))
// An IRI without a name part still needs an (empty) name entry, because nameId = 0 is looked up in the name table
dec.updateNames(RdfNameEntry(0, ""))
dec.decode(RdfIri(3, 0)) should be ("https://test.org/")
}

Expand Down
39 changes: 25 additions & 14 deletions core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
"add a full IRI" in {
val (encoder, buffer) = getEncoder()
val iri = encoder.encodeIri("https://test.org/Cake", buffer)
iri.nameId should be (1)
iri.nameId should be (0)
iri.prefixId should be (1)

buffer.size should be (2)
Expand All @@ -102,19 +102,27 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
iri.nameId should be (0)
iri.prefixId should be (1)

buffer.size should be (1)
// an empty name entry still has to be allocated
buffer.size should be (2)
buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix(
RdfPrefixEntry(id = 0, value = "https://test.org/test/")
)))
buffer should contain(RdfStreamRow(RdfStreamRow.Row.Name(
RdfNameEntry(id = 0, value = "")
)))
}

"add a name-only IRI" in {
val (encoder, buffer) = getEncoder()
val iri = encoder.encodeIri("testTestTest", buffer)
iri.nameId should be (1)
iri.prefixId should be (0)
iri.nameId should be (0)
iri.prefixId should be (1)

buffer.size should be (1)
// in the mode with the prefix table enabled, an empty prefix entry still has to be allocated
buffer.size should be (2)
buffer should contain(RdfStreamRow(RdfStreamRow.Row.Prefix(
RdfPrefixEntry(id = 0, value = "")
)))
buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
RdfNameEntry(id = 0, value = "testTestTest")
)))
Expand All @@ -123,9 +131,10 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
"add a full IRI in no-prefix table mode" in {
val (encoder, buffer) = getEncoder(0)
val iri = encoder.encodeIri("https://test.org/Cake", buffer)
iri.nameId should be (1)
iri.nameId should be (0)
iri.prefixId should be (0)

// in the no prefix mode, there must be no prefix entries
buffer.size should be (1)
buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
RdfNameEntry(id = 0, value = "https://test.org/Cake")
Expand All @@ -136,18 +145,19 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
val (encoder, buffer) = getEncoder(3)
val data = Seq(
// IRI, expected prefix ID, expected name ID
("https://test.org/Cake1", 1, 1),
("https://test.org/Cake1", 1, 0),
("https://test.org#Cake1", 2, 1),
("https://test.org/test/Cake1", 3, 1),
("https://test.org/Cake2", 1, 2),
("https://test.org/Cake2", 1, 0),
("https://test.org#Cake2", 2, 2),
("https://test.org/other/Cake1", 3, 1),
("https://test.org/other/Cake2", 3, 2),
("https://test.org/other/Cake3", 3, 3),
("https://test.org/other/Cake4", 3, 4),
("https://test.org/other/Cake5", 3, 1),
("https://test.org#Cake2", 2, 2),
("Cake2", 0, 2),
("https://test.org/other/Cake2", 0, 0),
("https://test.org/other/Cake3", 0, 0),
("https://test.org/other/Cake4", 0, 0),
("https://test.org/other/Cake5", 0, 1),
("https://test.org#Cake2", 2, 0),
// prefix "" evicts the previous number #1
("Cake2", 1, 2),
)

for (sIri, ePrefix, eName) <- data do
Expand All @@ -166,6 +176,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
(false, 0, "Cake3"),
(false, 0, "Cake4"),
(false, 1, "Cake5"),
(true, 1, ""),
)

buffer.size should be (expectedBuffer.size)
Expand Down
Loading