Add RAM-less Mode & improve API (#2)
- Improves speed & RAM usage (RAM-less mode trades some speed for a much smaller memory footprint)
- Added embedding download (`DownloadHelper`)
- Added caching to the low-RAM model
- Fixed cache to keep overflow words that are not in embeddings
- Updated README to show the new API
- Bumped version to 1.1.0
Lundez authored Feb 21, 2020
1 parent ce413b1 commit dd42ff5
Showing 14 changed files with 291 additions and 188 deletions.
21 changes: 11 additions & 10 deletions README.md
@@ -1,7 +1,7 @@
<a href='https://ko-fi.com/O5O819SEH' target='_blank'><img height='22' style='border:0px;height:22px;' src='https://az743702.vo.msecnd.net/cdn/kofi2.png?v=2' border='0' alt='Buy Me a Coffee at ko-fi.com' /></a>[![](https://jitpack.io/v/com.londogard/summarize-kt.svg)](https://jitpack.io/#com.londogard/summarize-kt)

# summarize-kt
- Summarisation library with an easy-to-use API (pre-loaded models). Currently only extractive summarisation is supported.
+ Summarization library with an easy-to-use API (pre-loaded models). Currently only extractive summarization is supported.

The layout:

@@ -13,10 +13,13 @@ The layout:
2. [Github Packages](#github-packages)

## Usage
- There's a wrapper class called `Summarizer` that allows us to
+ There's an interface named `Summarizer` that allows us to select the method of summarization
+ through its `companion object`. Two variants are available:

- 1) select the type of `SummarizerModel` through `SummarizeVariant` parameter
- 2) summarize texts using said model.
+ 1) `Summarizer.tfIdfSummarizer`
+ 2) `Summarizer.embeddingClusterSummarizer(threshold: Double = 0.2, simThreshold: Double = 0.95, scoreConfig: ScoringConfig = ScoringConfig.Ghalandari)`

+ `embeddingClusterSummarizer` supports two different scoring configurations. Read more in this [section](#explanation-of-the-different-configs).

`Summarizer` has two important methods:

@@ -27,15 +30,15 @@
Both methods return a summary of the text: the first returns a given number of sentences, the second a given ratio of the content.

##### Example where we'd return ~30% of the content
```kotlin
- val summarizer: Summarizer = Summarizer(TfIdf)
+ val summarizer: Summarizer = Summarizer.tfIdfSummarizer
val fullText = """
...Plenty of text...
"""
val summary = summarizer.summarize(fullText, ratio = 0.3)
```
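The embedding-cluster variant is created analogously. A sketch based on the companion-object API above, with the defaults written out explicitly (`fullText` as in the previous example):

```kotlin
val clusterSummarizer: Summarizer = Summarizer.embeddingClusterSummarizer(
    threshold = 0.2,
    simThreshold = 0.95,
    scoreConfig = ScoringConfig.Ghalandari
)
val summary = clusterSummarizer.summarize(fullText, ratio = 0.3)
```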

## Explanation of the different configs
- `Summarizer` currently support two different versions, either `TfIdf` or `EmbeddingCluster` as `SummarizeVariant`.
+ `Summarizer` currently supports two variants, `TfIdf` or `EmbeddingCluster`, where the latter has two different configs.
#### Term Frequency-Inverse Document Frequency (TFIDF)
`TfIdf` uses [TfIdf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) to find the most important sentences and then retrieves those back.
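The idea can be sketched in a few lines (an illustrative toy, not the library's actual implementation; `topSentence` is a hypothetical helper): score each sentence by the summed TF-IDF of its words, then keep the highest-scoring sentences.

```kotlin
import kotlin.math.ln

// Toy TF-IDF sentence scoring: each sentence is a "document"; rare-but-frequent
// words push their sentence's score up, common words (high document frequency)
// contribute nothing.
fun topSentence(sentences: List<String>): String {
    // Tokenize each sentence into lowercase words.
    val docs = sentences.map { s -> s.lowercase().split(Regex("\\W+")).filter { it.isNotBlank() } }
    // Document frequency: in how many sentences each word occurs.
    val df = docs.flatMap { it.toSet() }.groupingBy { it }.eachCount()
    // Score each sentence by the summed TF-IDF of its words.
    val scores = docs.map { words ->
        words.sumOf { w ->
            val tf = words.count { it == w }.toDouble() / words.size
            tf * ln(docs.size.toDouble() / df.getValue(w))
        }
    }
    return sentences[scores.withIndex().maxByOrNull { it.value }!!.index]
}
```

Here "dog" appears only in one sentence, so that sentence dominates the ranking, while "the" (present everywhere) scores zero.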
#### Embedding Cluster
@@ -58,10 +61,8 @@
The approach is chosen by the `ScoringConfig` where the first approach is based
In addition one can also set the TfIdf threshold mentioned using `threshold` and the similarity
threshold using `simThreshold`.

- **OBS** Currently Word Embeddings are not included,
- download them from [here](http://nlp.stanford.edu/data/glove.6B.zip) and then supply the path to the
- dimension you wish for.
- In the future there'll be a working `DownloadHelper` that'll download the embeddings if they're missing.
+ **OBS** if you want to use custom embeddings you'll currently have to fork the project.
+ The embeddings are downloaded automatically if you don't have them (OBS: a ~1 GB download, 157 MB on disk).

## Installation
The code is uploaded to two different repositories, both Jitpack.io and GitHub Packages.
2 changes: 1 addition & 1 deletion build.gradle.kts
@@ -9,7 +9,7 @@ plugins {
}

group = "com.londogard"
- version = "1.0.1"
+ version = "1.1.0"

repositories {
mavenCentral()
27 changes: 0 additions & 27 deletions src/main/kotlin/com/londogard/summarize/Summarizer.kt

This file was deleted.

@@ -0,0 +1,57 @@
package com.londogard.summarize.embeddings

import java.io.File
import java.io.FileOutputStream
import java.net.URL
import java.nio.file.Files
import java.util.zip.ZipFile


object DownloadHelper {
    val embeddingDirPath: String = "${System.getProperty("user.home")}${File.separator}summarize-embeddings"
    val embeddingPath: String = "$embeddingDirPath${File.separator}glove.6B.50d.txt"

    fun embeddingsExist(): Boolean = File(embeddingDirPath).let {
        it.exists() && it.isDirectory && it.listFiles()?.asList()?.isNotEmpty() == true
    }

    private fun String.saveTo(path: String) {
        URL(this).openStream().use { input ->
            FileOutputStream(File(path)).use { output ->
                input.copyTo(output)
            }
        }
    }

    /**
     * 1. Download to temp directory
     * 2. Extract embeddings into 'summarize-embeddings', placed in the root of the user's home folder.
     */
    fun downloadGloveEmbeddings() {
        if (embeddingsExist()) {
            println("Embeddings exist in path $embeddingDirPath, early exiting...")
            return
        }

        val tempFile = Files.createTempFile("glove", ".zip")
        val tempPath = tempFile.toAbsolutePath().toString()
        val customDir = File(embeddingDirPath)

        if (!customDir.exists()) customDir.mkdir()

        println("Downloading ~1 GB of GloVe word embeddings (this will take a while)...")
        "http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip".saveTo(tempPath)
        println("Download done!")
        println("Extracting 50d word embeddings (from $tempPath to $customDir). Extract your own if you want larger.")
        ZipFile(tempPath).use { zip ->
            zip.entries().asSequence()
                .filter { it.name.contains("50d") }
                .forEach { entry ->
                    zip.getInputStream(entry).use { input ->
                        File(customDir.absolutePath + File.separator + entry.name).outputStream()
                            .use { output -> input.copyTo(output) }
                    }
                }
        }
    }
}
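The extraction step above (copy out only the zip entries whose name matches a filter) can be isolated into a standalone sketch; `extractMatching` is a hypothetical helper, not part of the library:

```kotlin
import java.io.File
import java.util.zip.ZipFile

// Copy every entry whose name contains `match` from the zip at `zipPath`
// into `outDir`, returning the names of the extracted entries.
fun extractMatching(zipPath: String, outDir: File, match: String): List<String> =
    ZipFile(zipPath).use { zip ->
        zip.entries().asSequence()
            .filter { it.name.contains(match) }
            .map { entry ->
                zip.getInputStream(entry).use { input ->
                    File(outDir, entry.name).outputStream().use { output -> input.copyTo(output) }
                }
                entry.name
            }
            .toList()  // consume the sequence before the ZipFile is closed
    }
```

Note the `.toList()` inside `use`: the sequence is lazy, so it must be fully consumed before the `ZipFile` closes.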
99 changes: 99 additions & 0 deletions src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt
@@ -0,0 +1,99 @@
package com.londogard.summarize.embeddings

import com.londogard.summarize.extensions.`--`
import com.londogard.summarize.extensions.dot
import com.londogard.summarize.extensions.normalize
import java.nio.file.Files
import java.nio.file.Paths
import kotlin.math.sqrt
import kotlin.streams.asSequence

abstract class Embeddings {
    abstract val dimensions: Int
    abstract val delimiter: Char
    abstract val normalized: Boolean
    abstract val filename: String
    internal abstract val embeddings: Map<String, Array<Float>>

    /** Number of words in the vocabulary */
    val numWords by lazy { embeddings.keys.size }

    /** Check if the word is present in the vocab map.
     * @param word Word to be checked.
     * @return True if the word is in the vocab map.
     */
    fun contains(word: String): Boolean = embeddings.contains(word)

    /** Get the vector representation for the word.
     * @param word Word to retrieve vector for.
     * @return The vector representation of the word, or null if it is out of vocabulary.
     */
    fun vector(word: String): Array<Float>? = embeddings[word]

    /** Compute the Euclidean distance between the vector representations of the words.
     * @param w1 The first word.
     * @param w2 The other word.
     * @return The Euclidean distance, or null if either word is out of vocabulary.
     */
    fun euclidean(w1: String, w2: String): Double? = traverseVectors(listOf(w1, w2))?.let { vectors ->
        if (vectors.size == 2) euclidean(vectors.first(), vectors.last())
        else null
    }

    /** Compute the Euclidean distance between two vectors.
     * @param v1 The first vector.
     * @param v2 The other vector.
     * @return The Euclidean distance between the two vectors.
     */
    fun euclidean(v1: Array<Float>, v2: Array<Float>): Double =
        (v1 `--` v2).let { vector -> sqrt(vector.dot(vector)) }

    /** Compute the cosine similarity score between two vectors.
     * 1.0 means equal direction, 0.0 means orthogonal (90°) and -1.0 means opposite.
     * @param v1 The first vector.
     * @param v2 The other vector.
     * @return The cosine similarity score of the two vectors.
     */
    fun cosine(v1: Array<Float>, v2: Array<Float>): Double {
        if (v1.size != v2.size) throw ArithmeticException("Vectors must be same size (v1: ${v1.size} != v2: ${v2.size})")

        return v1.dot(v2) / (sqrt(v1.dot(v1)) * sqrt(v2.dot(v2)))
    }

    /** Compute the cosine similarity score between the vector representations of the words.
     * @param w1 The first word.
     * @param w2 The other word.
     * @return The cosine similarity score, or null if either word is out of vocabulary.
     */
    fun cosine(w1: String, w2: String): Double? = traverseVectors(listOf(w1, w2))?.let { vectors ->
        if (vectors.size == 2) cosine(vectors.first(), vectors.last())
        else null
    }

    /** Look up vectors for all words, returning null as soon as any word is out of vocabulary. */
    internal fun traverseVectors(words: List<String>): List<Array<Float>>? =
        words.map { word -> vector(word) ?: return null }

    internal fun loadEmbeddingsFromFile(inFilter: Set<String> = emptySet()): Map<String, Array<Float>> = Files
        .newBufferedReader(Paths.get(filename))
        .use { reader ->
            reader
                .lines()
                .filter { line -> inFilter.isEmpty() || inFilter.contains(line.takeWhile { it != delimiter }) }
                .asSequence()
                .mapNotNull { line ->
                    line
                        .split(delimiter)
                        .takeIf { it.size > dimensions }
                        ?.let { elems ->
                            val key = elems.first()
                            // Parse the embedding values, optionally normalizing the vector.
                            val value = Array(dimensions) { i -> elems[i + 1].toFloat() }
                                .let { if (normalized) it.normalize() else it }

                            key to value
                        }
                }
                .toMap()
        }
}
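As a quick sanity check of the cosine formula above, here is a standalone restatement of the same computation (not the class's code; it uses `FloatArray` and plain stdlib operations for brevity):

```kotlin
import kotlin.math.sqrt

// cosine(v1, v2) = dot(v1, v2) / (|v1| * |v2|)
// 1.0 for parallel vectors, 0.0 for orthogonal, -1.0 for opposite.
fun cosine(v1: FloatArray, v2: FloatArray): Double {
    require(v1.size == v2.size) { "Vectors must be same size (v1: ${v1.size} != v2: ${v2.size})" }
    val dot = v1.zip(v2).sumOf { (a, b) -> (a * b).toDouble() }
    val n1 = sqrt(v1.sumOf { (it * it).toDouble() })
    val n2 = sqrt(v2.sumOf { (it * it).toDouble() })
    return dot / (n1 * n2)
}
```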
@@ -0,0 +1,36 @@
package com.londogard.summarize.embeddings

class LightWordEmbeddings(
    override val dimensions: Int,
    override val filename: String = DownloadHelper.embeddingPath,
    override val delimiter: Char = ' ',
    override val normalized: Boolean = true,
    private val maxWordCount: Int = 1000
) : Embeddings() {
    /** Vocabulary, word to embedded space */
    override val embeddings: MutableMap<String, Array<Float>> = mutableMapOf()
    private val keys: MutableSet<String> = mutableSetOf()

    init {
        if (filename == DownloadHelper.embeddingPath && !DownloadHelper.embeddingsExist())
            DownloadHelper.downloadGloveEmbeddings()
    }

    fun addWords(words: Set<String>) {
        val leftToAdd = words - keys

        // Evict cached words the caller no longer needs if adding would exceed maxWordCount.
        if (leftToAdd.isNotEmpty() && leftToAdd.size + keys.size > maxWordCount) {
            val toRemove = keys - words
            keys -= toRemove
            embeddings -= toRemove
        }

        if (leftToAdd.isNotEmpty()) loadEmbeddings(leftToAdd)
    }

    private fun loadEmbeddings(inFilter: Set<String>) {
        keys += inFilter
        val loadedEmbeddings = loadEmbeddingsFromFile(inFilter)
        embeddings.putAll(loadedEmbeddings)
    }
}
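The caching behaviour of `addWords` can be demonstrated in isolation. The sketch below is a hypothetical `BoundedCache` (not part of the library) that mirrors the eviction logic, with a `load` function standing in for `loadEmbeddingsFromFile`:

```kotlin
// Keep at most maxWordCount entries; when adding new words would exceed the
// cap, evict cached words that are not in the currently requested set.
class BoundedCache(private val maxWordCount: Int) {
    val cache = mutableMapOf<String, Int>()

    fun addWords(words: Set<String>, load: (String) -> Int) {
        val leftToAdd = words - cache.keys
        if (leftToAdd.isNotEmpty() && leftToAdd.size + cache.size > maxWordCount) {
            cache.keys.retainAll(words)  // evict entries not in the requested set
        }
        leftToAdd.forEach { word -> cache[word] = load(word) }
    }
}
```

Words already cached are never reloaded; eviction only happens when the cap would be exceeded, which keeps the hot vocabulary in memory across summarization calls.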
