Skip to content

Commit

Permalink
Adapt the main class to be able to use the new 'LSI' implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
loehnertz committed Jun 6, 2019
1 parent b14a30f commit b9639de
Showing 1 changed file with 26 additions and 10 deletions.
36 changes: 26 additions & 10 deletions src/main/kotlin/SemanticCouplingCalculator.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package codes.jakob.semanticcoupling

import codes.jakob.semanticcoupling.lsi.LatentSemanticIndexer
import codes.jakob.semanticcoupling.model.*
import codes.jakob.semanticcoupling.model.NaturalLanguage.Companion.getNaturalLanguageByName
import codes.jakob.semanticcoupling.model.ProgrammingLanguage.Companion.getProgrammingLanguageByName
Expand All @@ -17,10 +18,14 @@ class SemanticCouplingCalculator(private val files: List<Map<String, String>>, p
constructor(files: List<Map<String, String>>, selectedProgrammingLanguage: String, selectedNaturalLanguage: String, fileSimilaritiesToCalculate: List<List<String>>? = null) : this(files, getProgrammingLanguageByName(selectedProgrammingLanguage), getNaturalLanguageByName(selectedNaturalLanguage), fileSimilaritiesToCalculate?.map { Pair(it.first(), it.last()) })

private var useLemmatization = true
private val similarities: ArrayList<SemanticCoupling> = arrayListOf()
private var useLsi = true
private var numberOfLsiDimensions: Int = DefaultNumberOfLsiDimensions
private var maxLsiEpochs: Int = DefaultMaxLsiEpochs
private lateinit var corpus: Corpus
private var documentSimilarities: ArrayList<SemanticCoupling> = arrayListOf()

fun calculate() {
similarities.clear()
documentSimilarities.clear()

val deferredDocuments: ArrayList<Deferred<Document>> = arrayListOf()
for (file: Map<String, String> in files) {
Expand All @@ -29,21 +34,20 @@ class SemanticCouplingCalculator(private val files: List<Map<String, String>>, p
}
}

runBlocking {
var corpus = Corpus(ArrayList(deferredDocuments.map { it.await() }))
corpus = TfIdfCalculator(corpus, fileSimilaritiesToCalculate).calculateForAllTerms()
runBlocking { corpus = Corpus(deferredDocuments.map { it.await() }.toMutableSet()) }

val documentSimilarities: List<SemanticCoupling> = SimilarityCalculator(corpus, fileSimilaritiesToCalculate).calculateDocumentSimilarities()
documentSimilarities.forEach { similarities.add(it) }
}
corpus = TfIdfCalculator(corpus, if (useLsi) null else fileSimilaritiesToCalculate).calculateForAllTerms()

val similarities: List<SemanticCoupling> = SimilarityCalculator(corpus, fileSimilaritiesToCalculate, useLsi, numberOfLsiDimensions, maxLsiEpochs).calculateDocumentSimilarities()
similarities.forEach { documentSimilarities.add(it) }
}

fun retrieveSimilaritiesAsListOfTriples(): List<Triple<String, String, Double>> {
return similarities.map { Triple(it.documents.first.name, it.documents.second.name, it.score) }
return documentSimilarities.map { Triple(it.documents.first.name, it.documents.second.name, it.score) }
}

fun retrieveSimilaritiesAsListsOfLists(): List<List<String>> {
return similarities.map { listOf(it.documents.first.name, it.documents.second.name, it.score.toString()) }
return documentSimilarities.map { listOf(it.documents.first.name, it.documents.second.name, it.score.toString()) }
}

fun useStemming() {
Expand All @@ -54,6 +58,16 @@ class SemanticCouplingCalculator(private val files: List<Map<String, String>>, p
useLemmatization = true
}

/**
 * Disables LSI for this calculator by clearing the `useLsi` flag.
 *
 * The configured number of dimensions and maximum epochs are left unchanged.
 */
fun doNotUseLsi() {
    this.useLsi = false
}

/**
 * Enables LSI and stores the settings to use for it.
 *
 * @param dimensions number of LSI dimensions; defaults to the currently configured value.
 * @param maxEpochs maximum number of LSI epochs; defaults to the currently configured value.
 */
fun useLsi(dimensions: Int = numberOfLsiDimensions, maxEpochs: Int = maxLsiEpochs) {
    // Record the settings first, then switch the feature on; the three writes are independent.
    this.numberOfLsiDimensions = dimensions
    this.maxLsiEpochs = maxEpochs
    this.useLsi = true
}

private fun parseFile(fileName: String, fileContents: String): Document {
return when (programmingLanguage) {
ProgrammingLanguage.JAVA -> JavaSourceCodeParser(naturalLanguage, fileName, fileContents, useLemmatization).parse()
Expand All @@ -62,5 +76,7 @@ class SemanticCouplingCalculator(private val files: List<Map<String, String>>, p

/**
 * Default values shared by all instances of the enclosing class.
 */
companion object Constants {
// English (NaturalLanguage.EN). NOTE(review): presumably the fallback natural language when the caller
// does not select one; its use site is not visible here, so confirm against the constructors.
private val DefaultNaturalLanguage = NaturalLanguage.EN
// Initial value of `numberOfLsiDimensions`; mirrors LatentSemanticIndexer.NumberOfDimensions.
private const val DefaultNumberOfLsiDimensions: Int = LatentSemanticIndexer.NumberOfDimensions
// Initial value of `maxLsiEpochs`; mirrors LatentSemanticIndexer.MaxEpochs.
private const val DefaultMaxLsiEpochs: Int = LatentSemanticIndexer.MaxEpochs
}
}

0 comments on commit b9639de

Please sign in to comment.