Skip to content

Commit

Permalink
fix deserialization
Browse files Browse the repository at this point in the history
  • Loading branch information
breandan committed Feb 20, 2024
1 parent c859fe6 commit 6f371c4
Showing 1 changed file with 18 additions and 10 deletions.
28 changes: 18 additions & 10 deletions src/jvmMain/kotlin/ai/hypergraph/markovian/mcmc/MarkovChain.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@ package ai.hypergraph.markovian.mcmc
import ai.hypergraph.kaliningraph.cache.LRUCache
import ai.hypergraph.kaliningraph.parsing.Σᐩ
import ai.hypergraph.kaliningraph.sampling.pow
import ai.hypergraph.kaliningraph.tokenizeByWhitespace
import ai.hypergraph.markovian.*
import ai.hypergraph.markovian.concurrency.*
import org.apache.datasketches.frequencies.ErrorType.NO_FALSE_POSITIVES
import org.apache.datasketches.frequencies.ItemsSketch
import org.jetbrains.kotlinx.multik.api.*
import org.jetbrains.kotlinx.multik.ndarray.data.*
import org.jetbrains.kotlinx.multik.ndarray.operations.*
import java.io.File
import java.util.concurrent.atomic.*
import java.util.stream.Stream
import kotlin.math.*
import kotlin.random.Random
import kotlin.streams.asStream
import kotlin.time.measureTimedValue


/**
Expand Down Expand Up @@ -245,18 +249,22 @@ open class MarkovChain<T>(
/**
 * Rebuilds a [MarkovChain] over string tokens from its CSV form.
 *
 * Each non-blank line of [csv] is read as `<ngram><CSVSEP><count>`, where the
 * n-gram is a list of tokens separated by single spaces and the count is a
 * decimal Long. The chain's `memory` is taken from the number of tokens in the
 * first record.
 *
 * Blank lines (e.g. the empty last line produced by a trailing newline) are
 * skipped instead of failing in `toLong()`.
 *
 * @param csv the serialized chain, one n-gram record per line
 * @return a chain with an empty training sequence whose counter is populated
 *   from the records
 * @throws IllegalArgumentException if [csv] contains no records
 * @throws NumberFormatException if a record's count is not a valid Long
 */
fun deserialize(csv: String): MarkovChain<Σᐩ> {
  // Parse every record exactly once; both sketches are filled from this list.
  val records: List<Pair<List<Σᐩ>, Long>> = csv.lines()
    .filter { it.isNotBlank() }
    .map { it.substringBefore(CSVSEP).split(" ") to it.substringAfter(CSVSEP).toLong() }
  require(records.isNotEmpty()) { "Cannot deserialize a MarkovChain from empty input" }

  val memory: Int = records.first().first.size

  // Sketch capacities are derived from the number of distinct tokens / records.
  // NOTE(review): presumably 2.pow(log2(n) + 2) rounds up to a power of two with
  // headroom, as ItemsSketch expects a power-of-two map size - confirm against
  // the project's pow/log2 helpers.
  val tokenSize = records.flatMap { it.first }.toSet().size.let { 2.pow(log2(it) + 2) }
  val ngramSize = 2.pow(log2(records.size) + 2)
  val rawCounts = ItemsSketch<Σᐩ>(tokenSize)
  val nrmCounts = ItemsSketch<List<Σᐩ?>>(ngramSize)

  var total = 0L
  records.forEach { (ngram, count) ->
    total += count
    nrmCounts.update(ngram, count)
    // Every token of the n-gram is credited with the n-gram's full count.
    ngram.forEach { rawCounts.update(it, count) }
  }

  return MarkovChain(
    train = sequenceOf(),
    memory = memory,
    // NOTE(review): total is accumulated as a Long but narrowed to Int here, so
    // totals above Int.MAX_VALUE would wrap - confirm Counter cannot take a Long.
    Counter(total = AtomicInteger(total.toInt()), memory = memory, rawCounts = rawCounts, nrmCounts = nrmCounts)
  )
}
}
Expand Down

0 comments on commit 6f371c4

Please sign in to comment.