-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add RAM-less Mode & improve API (#2)
- Improves speed & RAM usage if false for short time
- Added Download
- Added caching to low-RAM model
- Fixed cache to keep overflow words that are not in embeddings
- Updated README to show new API
- Bumping version to 1.1.0
- Loading branch information
Showing
14 changed files
with
291 additions
and
188 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ plugins { | |
} | ||
|
||
group = "com.londogard" | ||
version = "1.0.1" | ||
version = "1.1.0" | ||
|
||
repositories { | ||
mavenCentral() | ||
|
This file was deleted.
Oops, something went wrong.
57 changes: 57 additions & 0 deletions
57
src/main/kotlin/com/londogard/summarize/embeddings/DownloadHelper.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package com.londogard.summarize.embeddings | ||
|
||
import java.io.File | ||
import java.io.FileOutputStream | ||
import java.net.URL | ||
import java.nio.file.Files | ||
import java.util.zip.ZipFile | ||
|
||
|
||
/**
 * Locates and, when necessary, downloads the GloVe word embeddings.
 *
 * The embeddings are stored in a `summarize-embeddings` directory placed in the user's home folder.
 */
object DownloadHelper {
    /** Directory that holds the extracted embedding files. */
    val embeddingDirPath: String = "${System.getProperty("user.home")}${File.separator}summarize-embeddings"

    /** Path of the 50-dimensional GloVe file inside [embeddingDirPath]. */
    val embeddingPath: String = "$embeddingDirPath${File.separator}glove.6B.50d.txt"

    /**
     * @return `true` when [embeddingDirPath] is an existing directory that contains at least one entry.
     */
    fun embeddingsExist(): Boolean = File(embeddingDirPath).let {
        it.exists() && it.isDirectory && it.listFiles()?.asList()?.isNotEmpty() == true
    }

    /** Downloads the resource at the URL held by the receiver and writes it to [path]. */
    private fun String.saveTo(path: String) {
        URL(this).openStream().use { input ->
            FileOutputStream(File(path)).use { output ->
                input.copyTo(output)
            }
        }
    }

    /**
     * Downloads and unpacks the GloVe embeddings unless [embeddingsExist] already reports them present.
     *
     * 1. Download the archive to a temporary file.
     * 2. Extract the entries whose name contains "50d" into [embeddingDirPath].
     * 3. Delete the temporary archive, whether or not the previous steps succeeded.
     */
    fun downloadGloveEmbeddings() {
        if (embeddingsExist()) {
            println("Embeddings exist in path $embeddingDirPath, early exiting...")
            return
        }

        val customDir = File(embeddingDirPath)
        // mkdirs (rather than mkdir) also creates missing parent directories.
        if (!customDir.exists()) customDir.mkdirs()

        val tempFile = Files.createTempFile("glove", ".zip")
        val tempPath = tempFile.toAbsolutePath().toString()

        try {
            println("Downloading Glove Word Embeddings (this will take a while, ~1 GB)...")
            "http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip".saveTo(tempPath)
            println("Download done!")
            println("Extracting 50d word embeddings (from $tempPath to $customDir). Extract your own if you want larger.")
            ZipFile(tempPath).use { zip ->
                zip.entries().asSequence()
                    .filter { !it.isDirectory && it.name.contains("50d") }
                    .forEach { entry ->
                        // Keep only the base name so an entry such as "../x" cannot be written outside customDir.
                        val target = File(customDir, File(entry.name).name)
                        zip.getInputStream(entry).use { input ->
                            target.outputStream().use { output -> input.copyTo(output) }
                        }
                    }
            }
        } finally {
            // The archive is large; never leave it behind in the temp directory.
            Files.deleteIfExists(tempFile)
        }
    }
}
99 changes: 99 additions & 0 deletions
99
src/main/kotlin/com/londogard/summarize/embeddings/Embeddings.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
package com.londogard.summarize.embeddings | ||
|
||
import com.londogard.summarize.extensions.`--` | ||
import com.londogard.summarize.extensions.dot | ||
import com.londogard.summarize.extensions.normalize | ||
import java.nio.file.Files | ||
import java.nio.file.Paths | ||
import kotlin.math.sqrt | ||
import kotlin.streams.asSequence | ||
|
||
/**
 * Base class for word-embedding lookups backed by a text file with one word per line,
 * followed by its vector components separated by [delimiter].
 */
abstract class Embeddings {
    /** Number of vector components read for every word. */
    abstract val dimensions: Int

    /** Character separating the word and the vector components on each line of [filename]. */
    abstract val delimiter: Char

    /** When `true`, every vector is passed through `normalize()` while loading. */
    abstract val normalized: Boolean

    /** Path of the embeddings file read by [loadEmbeddingsFromFile]. */
    abstract val filename: String

    /** Vocabulary, word to embedded space. */
    internal abstract val embeddings: Map<String, Array<Float>>

    /**
     * The words currently present in [embeddings].
     *
     * NOTE: despite its name this is the key set of the map, not a count; use `numWords.size` for the count.
     */
    val numWords by lazy { embeddings.keys }

    /** Check if the word is present in the vocab map.
     * @param word Word to be checked.
     * @return True if the word is in the vocab map.
     */
    fun contains(word: String): Boolean = embeddings.contains(word)

    /** Get the vector representation for the word.
     * @param word Word to retrieve vector for.
     * @return The vector representation of the word, or `null` when the word is unknown.
     */
    fun vector(word: String): Array<Float>? = embeddings[word]

    /** Compute the Euclidean distance between the vector representations of the words.
     * @param w1 The first word.
     * @param w2 The other word.
     * @return The Euclidean distance between the two word vectors, or `null` when either word is unknown.
     */
    fun euclidean(w1: String, w2: String): Double? =
        traverseVectors(listOf(w1, w2))?.let { (first, second) -> euclidean(first, second) }

    /** Compute the Euclidean distance between two vectors.
     * @param v1 The first vector.
     * @param v2 The other vector.
     * @return The Euclidean distance between the two vectors.
     */
    fun euclidean(v1: Array<Float>, v2: Array<Float>): Double =
        (v1 `--` v2).let { vector -> sqrt(vector.dot(vector)) }

    /** Compute the cosine similarity score between two vectors.
     * 1.0 means equal direction, 0 means orthogonal (90 degrees) and -1 means opposite.
     * @param v1 The first vector.
     * @param v2 The other vector.
     * @return The cosine similarity score of the two vectors.
     * @throws ArithmeticException when the vectors differ in size.
     */
    fun cosine(v1: Array<Float>, v2: Array<Float>): Double {
        if (v1.size != v2.size) throw ArithmeticException("Vectors must be same size (v1: ${v1.size} != v2: ${v2.size})")

        return v1.dot(v2) / (sqrt(v1.dot(v1)) * sqrt(v2.dot(v2)))
    }

    /** Compute the cosine similarity score between the vector representations of the words.
     * @param w1 The first word.
     * @param w2 The other word.
     * @return The cosine similarity score between the two word vectors, or `null` when either word is unknown.
     */
    fun cosine(w1: String, w2: String): Double? =
        traverseVectors(listOf(w1, w2))?.let { (first, second) -> cosine(first, second) }

    /**
     * Look up the vector of every word in [words], preserving order.
     *
     * @return One vector per word, or `null` as soon as any word has no vector. A non-null result
     * therefore always has the same size as [words] (an empty input yields an empty list).
     */
    internal fun traverseVectors(words: List<String>): List<Array<Float>>? {
        val vectors = words.mapNotNull { word -> vector(word) }

        // A shorter list means at least one word was missing; report that as null rather than a partial result.
        return vectors.takeIf { it.size == words.size }
    }

    /**
     * Read embeddings from [filename].
     *
     * A line is considered only when its first token (the text before the first [delimiter]) is in
     * [inFilter], or when [inFilter] is empty. Lines with fewer than `dimensions + 1` columns are skipped.
     * If a word occurs on several lines, the last occurrence wins.
     *
     * @param inFilter Words to load; an empty set loads every word in the file.
     * @return Map from word to its vector (normalized when [normalized] is `true`).
     */
    internal fun loadEmbeddingsFromFile(inFilter: Set<String> = emptySet()): Map<String, Array<Float>> = Files
        .newBufferedReader(Paths.get(filename))
        .use { reader ->
            reader
                .lines()
                .filter { line -> inFilter.isEmpty() || inFilter.contains(line.takeWhile { it != delimiter }) }
                .asSequence()
                .mapNotNull { line ->
                    line
                        .split(delimiter)
                        .takeIf { it.size > dimensions }
                        ?.let { elems ->
                            val key = elems.first()
                            // Column 0 is the word; columns 1..dimensions are the vector components.
                            val value = Array(dimensions) { i -> elems[i + 1].toFloat() }
                                .let { if (normalized) it.normalize() else it }

                            key to value
                        }
                }
                .toMap()
        }
}
36 changes: 36 additions & 0 deletions
36
src/main/kotlin/com/londogard/summarize/embeddings/LightWordEmbeddings.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package com.londogard.summarize.embeddings | ||
|
||
/**
 * Embeddings that keep only a bounded working set of words in memory and read further words
 * from [filename] on demand through [addWords].
 *
 * @param dimensions Number of vector components per word.
 * @param filename Embeddings file; defaults to the GloVe file managed by [DownloadHelper].
 * @param delimiter Column separator used in the embeddings file.
 * @param normalized Whether vectors are normalized while loading.
 * @param maxWordCount Number of tracked words above which words outside the current request are evicted.
 */
class LightWordEmbeddings(
    override val dimensions: Int,
    override val filename: String = DownloadHelper.embeddingPath,
    override val delimiter: Char = ' ',
    override val normalized: Boolean = true,
    private val maxWordCount: Int = 1000
) : Embeddings() {
    /** Vocabulary, word to embedded space */
    override val embeddings: MutableMap<String, Array<Float>> = mutableMapOf()

    /**
     * Every word that has been requested and looked up, including words that turned out not to be
     * in the embeddings file, so those are not searched for again on every call.
     */
    private val keys: MutableSet<String> = mutableSetOf()

    init {
        // Only the default GloVe file can be fetched automatically; custom files must already exist.
        if (filename == DownloadHelper.embeddingPath && !DownloadHelper.embeddingsExist())
            DownloadHelper.downloadGloveEmbeddings()
    }

    /**
     * Make sure the given words have been looked up in the embeddings file.
     *
     * When adding the missing words would push the number of tracked words above [maxWordCount],
     * every tracked word that is not part of [words] is evicted first.
     *
     * @param words Words whose vectors should be available afterwards (when present in the file).
     */
    fun addWords(words: Set<String>) {
        val leftToAdd = words - keys
        if (leftToAdd.isEmpty()) return

        if (leftToAdd.size + keys.size > maxWordCount) {
            val toRemove = keys - words
            keys.removeAll(toRemove)
            embeddings.keys.removeAll(toRemove)
        }

        loadEmbeddings(leftToAdd)
    }

    /**
     * Load the vectors for [inFilter] from file and record the words as looked up.
     *
     * The file is read before any state is touched: if reading throws, the words are not marked
     * as tracked and a later [addWords] call will retry them.
     */
    private fun loadEmbeddings(inFilter: Set<String>) {
        val loadedEmbeddings = loadEmbeddingsFromFile(inFilter)
        embeddings.putAll(loadedEmbeddings)
        keys += inFilter
    }
}
Oops, something went wrong.