Skip to content

Commit

Permalink
Updated to use new embedding lib & smile pure
Browse files Browse the repository at this point in the history
  • Loading branch information
Lundez committed May 18, 2020
1 parent 3896fdc commit dd692f4
Show file tree
Hide file tree
Showing 8 changed files with 20 additions and 310 deletions.
14 changes: 6 additions & 8 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
plugins {
`maven-publish`
id("org.jetbrains.dokka") version "0.9.17"
id("com.github.ben-manes.versions") version "0.27.0"
kotlin("jvm") version "1.3.60"
kotlin("jvm") version "1.3.72"
}

group = "com.londogard"
version = "1.1.1"
version = "1.2.0"
val smileVersion = "2.4.0"

repositories {
mavenCentral()
Expand All @@ -18,7 +18,9 @@ repositories {

dependencies {
implementation(kotlin("stdlib-jdk8"))
api("com.londogard:smile-nlp-kt:1.0.1-beta")
implementation("com.londogard:embeddings-kt:master-SNAPSHOT")
implementation("com.github.haifengl:smile-nlp:$smileVersion")
implementation("com.github.haifengl:smile-kotlin:$smileVersion")

testImplementation("junit:junit:4.12")
}
Expand All @@ -27,10 +29,6 @@ tasks.withType<KotlinCompile> {
kotlinOptions.jvmTarget = "1.8"
}

tasks.withType<KotlinCompile> {
kotlinOptions.jvmTarget = "1.8"
}

tasks.dokka {
outputFormat = "html"
outputDirectory = "$buildDir/javadoc"
Expand Down

This file was deleted.

99 changes: 0 additions & 99 deletions src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ internal fun Array<Float>.normalize(): Array<Float> = when {
}
}

internal fun Iterable<Array<Float>>.sumByColumns(): Array<Float> = reduce { agg, vector -> agg + vector }
/**
 * Collapses all vectors in the receiver into a single vector by reducing them pairwise
 * with the infix `++` function.
 *
 * Uses `reduce`, so the receiver must contain at least one vector; an empty receiver throws.
 * NOTE(review): `++` is declared outside this view — presumably element-wise addition; confirm.
 */
internal fun Iterable<Array<Float>>.sumByColumns(): Array<Float> = reduce { agg, vector -> agg `++` vector }

internal fun List<List<Double>>.mutableSumByCols(): List<Double> {
internal fun List<DoubleArray>.mutableSumByCols(): List<Double> {
val columnSum = MutableList(this[0].size) { 0.0 }
for (columns in this)
for (i in columns.indices)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
package com.londogard.summarize.summarizers

import com.londogard.smile.SmileOperators
import com.londogard.smile.extensions.*
import com.londogard.embeddings.LightWordEmbeddings
import com.londogard.summarize.extensions.*
import com.londogard.summarize.embeddings.LightWordEmbeddings
import com.londogard.summarize.extensions.mutableSumByCols
import com.londogard.summarize.extensions.normalize
import smile.nlp.*
import kotlin.math.min
import kotlin.math.roundToInt

Expand All @@ -19,7 +18,7 @@ internal class EmbeddingClusterSummarizer(
private val simThreshold: Double,
private val config: ScoringConfig,
embeddingOverload: Pair<String, Int>?
) : SmileOperators, Summarizer {
) : Summarizer {
private val embeddings = embeddingOverload
?.let { (path, dim) -> LightWordEmbeddings(dim, path) } ?: LightWordEmbeddings()
private val zeroArray = Array(embeddings.dimensions) { 0f }
Expand All @@ -29,7 +28,7 @@ internal class EmbeddingClusterSummarizer(
private fun getWordsAboveTfIdfThreshold(sentences: List<String>): Set<String> {
val corpus = sentences.map { it.bag(stemmer = null) }
val words = corpus.flatMap { bag -> bag.keys }.distinct()
val bags = corpus.map { vectorize(words, it) }
val bags = corpus.map { vectorize(words.toTypedArray(), it) }
val vectors = tfidf(bags)
val vector = vectors.mutableSumByCols()
val vecMax = vector.max() ?: 1.0
Expand All @@ -41,7 +40,7 @@ internal class EmbeddingClusterSummarizer(
}


private fun getWordVector(words: List<String>, allowedWords: Set<String>): Array<Float> = words
/**
 * Builds one vector for [words], considering only the words contained in [allowedWords].
 *
 * The kept words are folded into an accumulator that starts at `zeroArray`; a word for which
 * `embeddings.vector` returns null contributes `zeroArray` instead. The folded result is then
 * passed through `normalize()`.
 * NOTE(review): `++` and `normalize` are declared outside this view — presumably element-wise
 * addition and vector-length normalization; confirm.
 */
private fun getWordVector(words: Array<String>, allowedWords: Set<String>): Array<Float> = words
.filter(allowedWords::contains)
.fold(zeroArray) { acc, word -> (embeddings.vector(word) ?: zeroArray) `++` acc }
.normalize()
Expand Down Expand Up @@ -133,8 +132,9 @@ internal class EmbeddingClusterSummarizer(
val wordsOfInterest = getWordsAboveTfIdfThreshold(superCleanSentences)
embeddings.addWords(wordsOfInterest)

val centroidVector = getWordVector(superCleanSentences.flatMap { it.words() }, wordsOfInterest)
val scores = getSentenceBaseScoring(superCleanSentences, sentences, centroidVector, wordsOfInterest)
val words = superCleanSentences.flatMap { it.words().toList() }
val centroidVector = getWordVector(words.toTypedArray(), wordsOfInterest)
val scores = getSentenceBaseScoring(superCleanSentences, sentences.toList(), centroidVector, wordsOfInterest)
val finalSentences = when (config) {
ScoringConfig.Ghalandari -> scoreGhalandari(lines, centroidVector, scores)
ScoringConfig.Rosselio -> scoreRosellio(lines, scores)
Expand Down
Loading

0 comments on commit dd692f4

Please sign in to comment.