diff --git a/build.gradle.kts b/build.gradle.kts
index 02bd15c..340ea26 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -4,12 +4,12 @@ import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
 plugins {
     `maven-publish`
     id("org.jetbrains.dokka") version "0.9.17"
-    id("com.github.ben-manes.versions") version "0.27.0"
-    kotlin("jvm") version "1.3.60"
+    kotlin("jvm") version "1.3.72"
 }
 
 group = "com.londogard"
-version = "1.1.1"
+version = "1.2.0"
+val smileVersion = "2.4.0"
 
 repositories {
     mavenCentral()
@@ -18,7 +18,9 @@ repositories {
 
 dependencies {
     implementation(kotlin("stdlib-jdk8"))
-    api("com.londogard:smile-nlp-kt:1.0.1-beta")
+    implementation("com.londogard:embeddings-kt:master-SNAPSHOT")
+    implementation("com.github.haifengl:smile-nlp:$smileVersion")
+    implementation("com.github.haifengl:smile-kotlin:$smileVersion")
 
     testImplementation("junit:junit:4.12")
 }
@@ -27,10 +29,6 @@ tasks.withType<KotlinCompile> {
     kotlinOptions.jvmTarget = "1.8"
 }
 
-tasks.withType<KotlinCompile> {
-    kotlinOptions.jvmTarget = "1.8"
-}
-
 tasks.dokka {
     outputFormat = "html"
     outputDirectory = "$buildDir/javadoc"
diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/DownloadHelper.kt b/src/main/kotlin/com/londogard/summarize/embeddings/DownloadHelper.kt
deleted file mode 100644
index 3273320..0000000
--- a/src/main/kotlin/com/londogard/summarize/embeddings/DownloadHelper.kt
+++ /dev/null
@@ -1,57 +0,0 @@
-package com.londogard.summarize.embeddings
-
-import java.io.File
-import java.io.FileOutputStream
-import java.net.URL
-import java.nio.file.Files
-import java.util.zip.ZipFile
-
-object DownloadHelper {
-    private val embeddingDirPath: String = "${System.getProperty("user.home")}${File.separator}summarize-embeddings"
-    val embeddingPath: String = "$embeddingDirPath${File.separator}glove.6B.50d.txt"
-    const val dimension: Int = 50
-
-    fun embeddingsExist(): Boolean = File(embeddingDirPath).let {
-        it.exists() && it.isDirectory && it.listFiles()?.asList()?.isNotEmpty() == true
-    }
-
-    private fun String.saveTo(path: String) {
-        URL(this).openStream().use { input ->
-            FileOutputStream(File(path)).use { output ->
-                input.copyTo(output)
-            }
-        }
-    }
-
-    /**
-     * 1. Download to temp directory
-     * 2. Extract embeddings into 'summarize-embeddings', placed in the root of the user's home folder.
-     */
-    fun downloadGloveEmbeddings() {
-        if (embeddingsExist()) {
-            println("Embeddings exist in path $embeddingDirPath, early exiting...")
-            return
-        }
-
-        val tempFile = Files.createTempFile("glove", ".zip")
-        val tempPath = tempFile.toAbsolutePath().toString()
-        val customDir = File(embeddingDirPath)
-
-        if (!customDir.exists()) customDir.mkdir()
-
-        println("Downloading Glove Word Embeddings (~1 GB, this will take a while)...")
-        "http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip".saveTo(tempPath)
-        println("Download done!")
-        println("Extracting 50d word embeddings (from $tempPath to $customDir). Extract your own if you want larger.")
-        ZipFile(tempPath).use { zip ->
-            zip.entries().asSequence()
-                .filter { it.name.contains("50d") }
-                .forEach { entry ->
-                    zip.getInputStream(entry).use { input ->
-                        File(customDir.absolutePath + File.separator + entry.name).outputStream()
-                            .use { output -> input.copyTo(output) }
-                    }
-                }
-        }
-    }
-}
\ No newline at end of file
diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt b/src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt
deleted file mode 100644
index 0468028..0000000
--- a/src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt
+++ /dev/null
@@ -1,99 +0,0 @@
-package com.londogard.summarize.embeddings
-
-import com.londogard.summarize.extensions.`--`
-import com.londogard.summarize.extensions.dot
-import com.londogard.summarize.extensions.normalize
-import java.nio.file.Files
-import java.nio.file.Paths
-import kotlin.math.sqrt
-import kotlin.streams.asSequence
-
-abstract class Embeddings {
-    abstract val dimensions: Int
-    abstract val delimiter: Char
-    abstract val normalized: Boolean
-    abstract val filename: String
-    internal abstract val embeddings: Map<String, Array<Float>>
-
-    /** The words of the vocabulary */
-    val numWords by lazy { embeddings.keys }
-
-    /** Check if the word is present in the vocab map.
-     * @param word Word to be checked.
-     * @return True if the word is in the vocab map.
-     */
-    fun contains(word: String): Boolean = embeddings.contains(word)
-
-    /** Get the vector representation for the word.
-     * @param word Word to retrieve vector for.
-     * @return The vector representation of the word.
-     */
-    fun vector(word: String): Array<Float>? = embeddings[word]
-
-    /** Compute the Euclidean distance between the vector representations of the words.
-     * @param w1 The first word.
-     * @param w2 The other word.
-     * @return The Euclidean distance between the vector representations of the words.
-     */
-    fun euclidean(w1: String, w2: String): Double? = traverseVectors(listOf(w1, w2))?.let { vectors ->
-        if (vectors.size == 2) euclidean(vectors.first(), vectors.last())
-        else null
-    }
-
-    /** Compute the Euclidean distance between two vectors.
-     * @param v1 The first vector.
-     * @param v2 The other vector.
-     * @return The Euclidean distance between the two vectors.
-     */
-    fun euclidean(v1: Array<Float>, v2: Array<Float>): Double =
-        (v1 `--` v2).let { vector -> sqrt(vector.dot(vector)) }
-
-    /** Compute the cosine similarity score between two vectors.
-     * 1.0 means identical, 0.0 means orthogonal (90°) and -1.0 means opposite.
-     * @param v1 The first vector.
-     * @param v2 The other vector.
-     * @return The cosine similarity score of the two vectors.
-     */
-    fun cosine(v1: Array<Float>, v2: Array<Float>): Double {
-        if (v1.size != v2.size) throw ArithmeticException("Vectors must be same size (v1: ${v1.size} != v2: ${v2.size})")
-
-        return v1.dot(v2) / (sqrt(v1.dot(v1)) * sqrt(v2.dot(v2)))
-    }
-
-    /** Compute the cosine similarity score between the vector representations of the words.
-     * @param w1 The first word.
-     * @param w2 The other word.
-     * @return The cosine similarity score between the vector representations of the words.
-     */
-    fun cosine(w1: String, w2: String): Double? = traverseVectors(listOf(w1, w2))?.let { vectors ->
-        if (vectors.size == 2) cosine(vectors.first(), vectors.last())
-        else null
-    }
-
-    internal fun traverseVectors(words: List<String>): List<Array<Float>>? = words
-        .fold(listOf<Array<Float>>() as List<Array<Float>>?) { agg, word ->
-            vector(word)?.let { vector -> (agg ?: emptyList()) + listOf(vector) }
-        }
-
-    internal fun loadEmbeddingsFromFile(inFilter: Set<String> = emptySet()): Map<String, Array<Float>> = Files
-        .newBufferedReader(Paths.get(filename))
-        .use { reader ->
-            reader
-                .lines()
-                .filter { line -> inFilter.isEmpty() || inFilter.contains(line.takeWhile { it != delimiter }) }
-                .asSequence()
-                .mapNotNull { line ->
-                    line
-                        .split(delimiter)
-                        .takeIf { it.size > dimensions }
-                        ?.let { elems ->
-                            val key = elems.first()
-                            val value = Array(dimensions) { i -> elems[i + 1].toFloat() }
-                                .let { if (normalized) it.normalize() else it }
-
-                            key to value
-                        }
-                }
-                .toMap()
-        }
-}
\ No newline at end of file
diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/LightWordEmbeddings.kt b/src/main/kotlin/com/londogard/summarize/embeddings/LightWordEmbeddings.kt
deleted file mode 100644
index 216816a..0000000
--- a/src/main/kotlin/com/londogard/summarize/embeddings/LightWordEmbeddings.kt
+++ /dev/null
@@ -1,36 +0,0 @@
-package com.londogard.summarize.embeddings
-
-class LightWordEmbeddings(
-    override val dimensions: Int = DownloadHelper.dimension,
-    override val filename: String = DownloadHelper.embeddingPath,
-    override val delimiter: Char = ' ',
-    override val normalized: Boolean = true,
-    private val maxWordCount: Int = 1000
-) : Embeddings() {
-    /** Vocabulary, word to embedded space */
-    override val embeddings: MutableMap<String, Array<Float>> = mutableMapOf()
-    private val keys: MutableSet<String> = mutableSetOf()
-
-    init {
-        if (filename == DownloadHelper.embeddingPath && !DownloadHelper.embeddingsExist())
-            DownloadHelper.downloadGloveEmbeddings()
-    }
-
-    fun addWords(words: Set<String>) {
-        val leftToAdd = words - keys
-
-        if (leftToAdd.isNotEmpty() && leftToAdd.size + keys.size > maxWordCount) {
-            val toRemove = keys - words
-            keys -= toRemove
-            embeddings -= toRemove
-        }
-
-        if (leftToAdd.isNotEmpty()) loadEmbeddings(leftToAdd)
-    }
-
-    private fun loadEmbeddings(inFilter: Set<String>) {
-        keys += inFilter
-        val loadedEmbeddings = loadEmbeddingsFromFile(inFilter)
-        embeddings.putAll(loadedEmbeddings)
-    }
-}
\ No newline at end of file
diff --git a/src/main/kotlin/com/londogard/summarize/embeddings/WordEmbeddings.kt b/src/main/kotlin/com/londogard/summarize/embeddings/WordEmbeddings.kt
deleted file mode 100644
index 8457990..0000000
--- a/src/main/kotlin/com/londogard/summarize/embeddings/WordEmbeddings.kt
+++ /dev/null
@@ -1,92 +0,0 @@
-package com.londogard.summarize.embeddings
-
-import com.londogard.summarize.extensions.`++`
-import com.londogard.summarize.extensions.`--`
-import com.londogard.summarize.extensions.normalize
-import com.londogard.summarize.extensions.sumByColumns
-
-class WordEmbeddings(
-    override val dimensions: Int = DownloadHelper.dimension,
-    override val filename: String = DownloadHelper.embeddingPath,
-    override val delimiter: Char = ' ',
-    override val normalized: Boolean = true
-) : Embeddings() {
-    /** Vocabulary, word to embedded space */
-    override val embeddings: Map<String, Array<Float>> by lazy { loadEmbeddingsFromFile() }
-
-    init {
-        if (filename == DownloadHelper.embeddingPath && !DownloadHelper.embeddingsExist())
-            DownloadHelper.downloadGloveEmbeddings()
-    }
-
-    /** Find N closest terms in the vocab to the given vector, using only words from the in-set (if defined)
-     * and excluding all words from the out-set (if non-empty). Although you can, it doesn't make much
-     * sense to define both in and out sets.
-     * @param vector The vector.
-     * @param inSet Set of words to consider. Specify null to use all words in the vocab (default behavior).
-     * @param outSet Set of words to exclude (defaults to empty).
-     * @param N The maximum number of terms to return (defaults to 40).
-     * @return The N closest terms in the vocab to the given vector and their associated cosine similarity scores.
-     */
-    fun nearestNeighbours(
-        vector: Array<Float>, inSet: Set<String>? = null,
-        outSet: Set<String> = setOf(), N: Int = 40
-    ): List<Pair<String, Double>> {
-        val inputWords = (inSet ?: embeddings.keys) - outSet
-
-        return embeddings
-            .filterKeys(inputWords::contains)
-            .map { (k, v) -> k to cosine(vector, v) }
-            .sortedByDescending { (_, cosineDist) -> cosineDist }
-            .take(N)
-    }
-
-    /** Find the N closest terms in the vocab to the input word(s).
-     * @param input The input word(s).
-     * @param N The maximum number of terms to return (defaults to 40).
-     * @return The N closest terms in the vocab to the input word(s) and their associated cosine similarity scores.
-     */
-    fun distance(input: List<String>, N: Int = 40): List<Pair<String, Double>>? =
-        traverseVectors(input)
-            ?.let { vectors -> nearestNeighbours(vectors.sumByColumns().normalize(), outSet = input.toSet(), N = N) }
-
-    /** Find the N closest terms in the vocab to the analogy:
-     * - [w1] is to [w2] as [w3] is to ???
-     *
-     * The algorithm operates as follows:
-     * - Find a vector approximation of the missing word = vec([w2]) - vec([w1]) + vec([w3]).
-     * - Return words closest to the approximated vector.
-     *
-     * @param w1 First word in the analogy [w1] is to [w2] as [w3] is to ???.
-     * @param w2 Second word in the analogy [w1] is to [w2] as [w3] is to ???.
-     * @param w3 Third word in the analogy [w1] is to [w2] as [w3] is to ???.
-     * @param N The maximum number of terms to return (defaults to 40).
-     *
-     * @return The N closest terms in the vocab to the analogy and their associated cosine similarity scores.
-     */
-    fun analogy(w1: String, w2: String, w3: String, N: Int = 40): List<Pair<String, Double>>? =
-        traverseVectors(listOf(w1, w2, w3))
-            ?.takeIf { it.size == 3 }
-            ?.let { vec ->
-                val vector = (vec[1] `--` vec[0]) `++` vec[2]
-                nearestNeighbours(vector.normalize(), outSet = setOf(w1, w2, w3), N = N)
-            }
-
-    /** Rank a set of words by their respective distance to some central term.
-     * @param word The central word.
-     * @param set Set of words to rank.
-     * @return Ordered list of words and their associated scores.
-     */
-    fun rank(word: String, set: Set<String>): List<Pair<String, Double>> =
-        vector(word)
-            ?.let { vec -> nearestNeighbours(vec, inSet = set, N = set.size) }
-            ?: listOf()
-
-    /** Pretty print the list of words and their associated scores.
-     * @param words List of (word, score) pairs to be printed.
-     */
-    fun pprint(words: List<Pair<String, Double>>) {
-        println("\n%50s${" ".repeat(7)}Cosine distance\n${"-".repeat(72)}".format("Word"))
-        println(words.joinToString("\n") { (word, dist) -> "%50s${" ".repeat(7)}%15f".format(word, dist) })
-    }
-}
\ No newline at end of file
diff --git a/src/main/kotlin/com/londogard/summarize/extensions/ArrayHelper.kt b/src/main/kotlin/com/londogard/summarize/extensions/ArrayHelper.kt
index 73c761b..d9cf584 100644
--- a/src/main/kotlin/com/londogard/summarize/extensions/ArrayHelper.kt
+++ b/src/main/kotlin/com/londogard/summarize/extensions/ArrayHelper.kt
@@ -34,9 +34,9 @@ internal fun Array<Float>.normalize(): Array<Float> = when {
     }
 }
 
-internal fun Iterable<Array<Float>>.sumByColumns(): Array<Float> = reduce { agg, vector -> agg + vector }
+internal fun Iterable<Array<Float>>.sumByColumns(): Array<Float> = reduce { agg, vector -> agg `++` vector }
 
-internal fun List<Array<Double>>.mutableSumByCols(): List<Double> {
+internal fun List<DoubleArray>.mutableSumByCols(): List<Double> {
     val columnSum = MutableList(this[0].size) { 0.0 }
     for (columns in this)
         for (i in columns.indices)
diff --git a/src/main/kotlin/com/londogard/summarize/summarizers/EmbeddingClusterSummarizer.kt b/src/main/kotlin/com/londogard/summarize/summarizers/EmbeddingClusterSummarizer.kt
index 1583bb6..84a3e52 100644
--- a/src/main/kotlin/com/londogard/summarize/summarizers/EmbeddingClusterSummarizer.kt
+++ b/src/main/kotlin/com/londogard/summarize/summarizers/EmbeddingClusterSummarizer.kt
@@ -1,11 +1,10 @@
 package com.londogard.summarize.summarizers
 
-import com.londogard.smile.SmileOperators
-import com.londogard.smile.extensions.*
+import com.londogard.embeddings.LightWordEmbeddings
 import com.londogard.summarize.extensions.*
-import com.londogard.summarize.embeddings.LightWordEmbeddings
 import com.londogard.summarize.extensions.mutableSumByCols
 import com.londogard.summarize.extensions.normalize
+import smile.nlp.*
 import kotlin.math.min
 import kotlin.math.roundToInt
 
@@ -19,7 +18,7 @@ internal class EmbeddingClusterSummarizer(
     private val simThreshold: Double,
     private val config: ScoringConfig,
     embeddingOverload: Pair<String, Int>?
-) : SmileOperators, Summarizer {
+) : Summarizer {
     private val embeddings = embeddingOverload
         ?.let { (path, dim) -> LightWordEmbeddings(dim, path) } ?: LightWordEmbeddings()
     private val zeroArray = Array(embeddings.dimensions) { 0f }
@@ -29,7 +28,7 @@ internal class EmbeddingClusterSummarizer(
     private fun getWordsAboveTfIdfThreshold(sentences: List<String>): Set<String> {
         val corpus = sentences.map { it.bag(stemmer = null) }
         val words = corpus.flatMap { bag -> bag.keys }.distinct()
-        val bags = corpus.map { vectorize(words, it) }
+        val bags = corpus.map { vectorize(words.toTypedArray(), it) }
         val vectors = tfidf(bags)
         val vector = vectors.mutableSumByCols()
         val vecMax = vector.max() ?: 1.0
@@ -41,7 +40,7 @@ internal class EmbeddingClusterSummarizer(
     }
 
-    private fun getWordVector(words: List<String>, allowedWords: Set<String>): Array<Float> = words
+    private fun getWordVector(words: Array<String>, allowedWords: Set<String>): Array<Float> = words
         .filter(allowedWords::contains)
         .fold(zeroArray) { acc, word -> (embeddings.vector(word) ?: zeroArray) `++` acc }
         .normalize()
 
@@ -133,8 +132,9 @@ internal class EmbeddingClusterSummarizer(
         val wordsOfInterest = getWordsAboveTfIdfThreshold(superCleanSentences)
         embeddings.addWords(wordsOfInterest)
 
-        val centroidVector = getWordVector(superCleanSentences.flatMap { it.words() }, wordsOfInterest)
-        val scores = getSentenceBaseScoring(superCleanSentences, sentences, centroidVector, wordsOfInterest)
+        val words = superCleanSentences.flatMap { it.words().toList() }
+        val centroidVector = getWordVector(words.toTypedArray(), wordsOfInterest)
+        val scores = getSentenceBaseScoring(superCleanSentences, sentences.toList(), centroidVector, wordsOfInterest)
         val finalSentences = when (config) {
             ScoringConfig.Ghalandari -> scoreGhalandari(lines, centroidVector, scores)
             ScoringConfig.Rosselio -> scoreRosellio(lines, scores)
diff --git a/src/main/kotlin/com/londogard/summarize/summarizers/TfIdfSummarizer.kt b/src/main/kotlin/com/londogard/summarize/summarizers/TfIdfSummarizer.kt
index 5408a76..6819e5d 100644
--- a/src/main/kotlin/com/londogard/summarize/summarizers/TfIdfSummarizer.kt
+++ b/src/main/kotlin/com/londogard/summarize/summarizers/TfIdfSummarizer.kt
@@ -1,15 +1,11 @@
 package com.londogard.summarize.summarizers
 
-import com.londogard.smile.SmileOperators
-import com.londogard.smile.extensions.bag
-import com.londogard.smile.extensions.normalize
-import com.londogard.smile.extensions.sentences
-import com.londogard.smile.extensions.words
 import com.londogard.summarize.extensions.mutableSumByCols
+import smile.nlp.*
 import kotlin.math.roundToInt
 
-internal class TfIdfSummarizer : SmileOperators, Summarizer {
-    private fun getSentences(text: String): List<String> = text.normalize().sentences()
+internal class TfIdfSummarizer : Summarizer {
+    private fun getSentences(text: String): List<String> = text.normalize().sentences().toList()
 
     override fun summarize(text: String, ratio: Double): String {
         val sentences = getSentences(text)
@@ -21,7 +17,7 @@ internal class TfIdfSummarizer : Summarizer {
     private fun summarize(sentences: List<String>, lines: Int): String {
         val corpus = sentences.map { it.bag() } // bag includes stemming
         val words = corpus.flatMap { bag -> bag.keys }.distinct()
-        val bags = corpus.map { vectorize(words, it) }
+        val bags = corpus.map { vectorize(words.toTypedArray(), it) }
         val vectors = tfidf(bags)
         val vector = vectors.mutableSumByCols()