Updated to use the new embeddings library (embeddings-kt) and plain Smile (smile-nlp, smile-kotlin) instead of smile-nlp-kt

londogard · May 18, 2020 · dd692f4
1 parent 3896fdc
commit dd692f4
Show file tree

Hide file tree

Showing 8 changed files with 20 additions and 310 deletions.
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -4,12 +4,12 @@ import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
 plugins {
  `maven-publish`
  id("org.jetbrains.dokka") version "0.9.17"
- id("com.github.ben-manes.versions") version "0.27.0"
- kotlin("jvm") version "1.3.60"
+ kotlin("jvm") version "1.3.72"
 }
 
 group = "com.londogard"
-version = "1.1.1"
+version = "1.2.0"
+val smileVersion = "2.4.0"
 
 repositories {
  mavenCentral()
@@ -18,7 +18,9 @@ repositories {
 
 dependencies {
  implementation(kotlin("stdlib-jdk8"))
- api("com.londogard:smile-nlp-kt:1.0.1-beta")
+ implementation("com.londogard:embeddings-kt:master-SNAPSHOT")
+ implementation("com.github.haifengl:smile-nlp:$smileVersion")
+ implementation("com.github.haifengl:smile-kotlin:$smileVersion")
 
  testImplementation("junit:junit:4.12")
 }
@@ -27,10 +29,6 @@ tasks.withType<KotlinCompile> {
  kotlinOptions.jvmTarget = "1.8"
 }
 
-tasks.withType<KotlinCompile> {
- kotlinOptions.jvmTarget = "1.8"
-}
-
 tasks.dokka {
  outputFormat = "html"
  outputDirectory = "$buildDir/javadoc"

diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/DownloadHelper.kt b/src/main/kotlin/com/londogard/summarize/embeddings/DownloadHelper.kt
diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt b/src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt
diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/LightWordEmbeddings.kt b/src/main/kotlin/com/londogard/summarize/embeddings/LightWordEmbeddings.kt
diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/WordEmbeddings.kt b/src/main/kotlin/com/londogard/summarize/embeddings/WordEmbeddings.kt
diff --git a/src/main/kotlin/com/londogard/summarize/extensions/ArrayHelper.kt b/src/main/kotlin/com/londogard/summarize/extensions/ArrayHelper.kt
@@ -34,9 +34,9 @@ internal fun Array<Float>.normalize(): Array<Float> = when {
  }
 }
 
-internal fun Iterable<Array<Float>>.sumByColumns(): Array<Float> = reduce { agg, vector -> agg + vector }
+internal fun Iterable<Array<Float>>.sumByColumns(): Array<Float> = reduce { agg, vector -> agg `++` vector }
 
-internal fun List<List<Double>>.mutableSumByCols(): List<Double> {
+internal fun List<DoubleArray>.mutableSumByCols(): List<Double> {
  val columnSum = MutableList(this[0].size) { 0.0 }
  for (columns in this)
  for (i in columns.indices)

diff --git a/src/main/kotlin/com/londogard/summarize/summarizers/EmbeddingClusterSummarizer.kt b/src/main/kotlin/com/londogard/summarize/summarizers/EmbeddingClusterSummarizer.kt
@@ -1,11 +1,10 @@
 package com.londogard.summarize.summarizers
 
-import com.londogard.smile.SmileOperators
-import com.londogard.smile.extensions.*
+import com.londogard.embeddings.LightWordEmbeddings
 import com.londogard.summarize.extensions.*
-import com.londogard.summarize.embeddings.LightWordEmbeddings
 import com.londogard.summarize.extensions.mutableSumByCols
 import com.londogard.summarize.extensions.normalize
+import smile.nlp.*
 import kotlin.math.min
 import kotlin.math.roundToInt
 
@@ -19,7 +18,7 @@ internal class EmbeddingClusterSummarizer(
  private val simThreshold: Double,
  private val config: ScoringConfig,
  embeddingOverload: Pair<String, Int>?
-) : SmileOperators, Summarizer {
+) : Summarizer {
  private val embeddings = embeddingOverload
  ?.let { (path, dim) -> LightWordEmbeddings(dim, path) } ?: LightWordEmbeddings()
  private val zeroArray = Array(embeddings.dimensions) { 0f }
@@ -29,7 +28,7 @@ internal class EmbeddingClusterSummarizer(
  private fun getWordsAboveTfIdfThreshold(sentences: List<String>): Set<String> {
  val corpus = sentences.map { it.bag(stemmer = null) }
  val words = corpus.flatMap { bag -> bag.keys }.distinct()
- val bags = corpus.map { vectorize(words, it) }
+ val bags = corpus.map { vectorize(words.toTypedArray(), it) }
  val vectors = tfidf(bags)
  val vector = vectors.mutableSumByCols()
  val vecMax = vector.max() ?: 1.0
@@ -41,7 +40,7 @@ internal class EmbeddingClusterSummarizer(
  }
 
 
- private fun getWordVector(words: List<String>, allowedWords: Set<String>): Array<Float> = words
+ private fun getWordVector(words: Array<String>, allowedWords: Set<String>): Array<Float> = words
  .filter(allowedWords::contains)
  .fold(zeroArray) { acc, word -> (embeddings.vector(word) ?: zeroArray) `++` acc }
  .normalize()
@@ -133,8 +132,9 @@ internal class EmbeddingClusterSummarizer(
  val wordsOfInterest = getWordsAboveTfIdfThreshold(superCleanSentences)
  embeddings.addWords(wordsOfInterest)
 
- val centroidVector = getWordVector(superCleanSentences.flatMap { it.words() }, wordsOfInterest)
- val scores = getSentenceBaseScoring(superCleanSentences, sentences, centroidVector, wordsOfInterest)
+ val words = superCleanSentences.flatMap { it.words().toList() }
+ val centroidVector = getWordVector(words.toTypedArray(), wordsOfInterest)
+ val scores = getSentenceBaseScoring(superCleanSentences, sentences.toList(), centroidVector, wordsOfInterest)
  val finalSentences = when (config) {
  ScoringConfig.Ghalandari -> scoreGhalandari(lines, centroidVector, scores)
  ScoringConfig.Rosselio -> scoreRosellio(lines, scores)