This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Commit 5e4b35f
Merge branch 'develop' into feat/ggml-update
philpax committed Nov 12, 2023
2 parents fcbfb4d + 52c2bb6
Showing 16 changed files with 629 additions and 97 deletions.
102 changes: 46 additions & 56 deletions Cargo.lock

(Generated file; diff not rendered.)

13 changes: 9 additions & 4 deletions Cargo.toml
@@ -6,7 +6,7 @@ members = [
     "crates/llm",
     "crates/llm-base",
     "crates/models/*",
-    "binaries/*"
+    "binaries/*",
 ]
 resolver = "2"
 default-members = ["binaries/llm-cli", "crates/llm"]
@@ -27,12 +27,12 @@ anyhow = "1.0"
 rustyline = { version = "11.0.0", features = ["derive"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = { version = "1.0" }
-spinoff = { version = "0.7.0", default-features = false, features = ["dots2"] }
+spinoff = { version = "0.8.0", default-features = false, features = ["dots2"] }
 clap = { version = "4.1.8", features = ["derive"] }
 memmap2 = "0.5.10"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 tracing = { version = "0.1", features = ["log"] }
-llm-samplers = "=0.0.6"
+llm-samplers = "=0.0.7"
 
 # Config for 'cargo dist'
 [workspace.metadata.dist]
@@ -45,7 +45,12 @@ ci = ["github"]
 # The installers to generate for each app
 installers = ["shell", "powershell"]
 # Target platforms to build apps for (Rust target-triple syntax)
-targets = ["x86_64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-pc-windows-msvc", "aarch64-apple-darwin"]
+targets = [
+    "x86_64-unknown-linux-gnu",
+    "x86_64-apple-darwin",
+    "x86_64-pc-windows-msvc",
+    "aarch64-apple-darwin",
+]
 
 # The profile that 'cargo dist' will build with
 [profile.dist]
1 change: 1 addition & 0 deletions README.md
@@ -287,6 +287,7 @@ Absolutely! Please see the [contributing guide](./doc/CONTRIBUTING.md).
 inference API on your local machine using `llm`.
 - [secondbrain](https://github.com/juliooa/secondbrain): Desktop app to download and run LLMs locally in your computer using `llm`.
 - [floneum](https://floneum.com/): A graph editor for local AI workflows.
+- [poly](https://github.com/pixelspark/poly): A versatile LLM serving back-end with tasks, streaming completion, memory retrieval, and more.
 
 #### Libraries
 
15 changes: 12 additions & 3 deletions binaries/llm-cli/src/cli_args.rs
@@ -290,6 +290,15 @@ pub struct Generate {
     /// top_p - The probability for the top tokens are added until the result is greater or equal to P and at least min_keep tokens have been seen.
     /// p(0.95): The cumulative probability after which no more tokens are kept for sampling.
     /// min_keep(1): Minimum tokens to keep. Setting this to 0 is not recommended.
+    ///
+    /// top_a (default: disabled) - This sampler prunes tokens that don't meet a threshold based on the most probable token. The formula is `a1 * pow(max_prob, a2)`. See https://github.com/BlinkDL/RWKV-LM#the-top-a-sampling-method for more information.
+    /// a1(0.0): Threshold scale. A reasonable value is 0.2. Setting either a1 or a2 to 0 disables the sampler.
+    /// a2(0.0): Threshold power. A reasonable value is 2.
+    /// min_keep(1): Minimum tokens to keep. Setting this to 0 is not recommended.
+    ///
+    /// min_p (default: disabled) - This sampler prunes tokens that don't meet a certain percentage of the most probable token. For example if `p` is `0.05` then after `min_keep` is satisfied, other tokens must be at least 5% of the most probable token. See https://github.com/ggerganov/llama.cpp/issues/3483 for more information.
+    /// p(0.0): Probability threshold. 0.05 to 0.2 are good starting values to try. Setting this to 0 disables the sampler.
+    /// min_keep(1): Minimum tokens to keep. Setting this to 0 is not recommended.
     #[arg(long = "sampler", short = 's', verbatim_doc_comment)]
     pub sampler_options: Vec<String>,

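Both new samplers prune relative to the most probable token: `top_a` drops any token whose probability falls below `a1 * max_prob^a2`, while `min_p` drops tokens below the fixed fraction `p` of `max_prob`; in both cases the `min_keep` most probable tokens always survive. A minimal sketch of the two pruning rules over a plain probability slice (illustrative only, not the `llm-samplers` API the CLI actually uses):

```rust
/// top_a: keep tokens with prob >= a1 * max_prob^a2.
fn top_a_keep(probs: &[f32], a1: f32, a2: f32, min_keep: usize) -> Vec<usize> {
    let max_prob = probs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    keep_above(probs, a1 * max_prob.powf(a2), min_keep)
}

/// min_p: keep tokens with prob >= p * max_prob.
fn min_p_keep(probs: &[f32], p: f32, min_keep: usize) -> Vec<usize> {
    let max_prob = probs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    keep_above(probs, p * max_prob, min_keep)
}

/// Shared helper: always retain the `min_keep` most probable tokens,
/// then every token at or above `threshold`.
fn keep_above(probs: &[f32], threshold: f32, min_keep: usize) -> Vec<usize> {
    let mut order: Vec<usize> = (0..probs.len()).collect();
    order.sort_by(|&a, &b| probs[b].total_cmp(&probs[a])); // descending by probability
    order
        .into_iter()
        .enumerate()
        .filter(|&(rank, idx)| rank < min_keep || probs[idx] >= threshold)
        .map(|(_, idx)| idx)
        .collect()
}
```

For example, with `a1 = 0.2`, `a2 = 2`, and a top probability of 0.6, the top_a cutoff is `0.2 * 0.6^2 = 0.072`; with `p = 0.05`, the min_p cutoff is `0.03`.
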
@@ -533,7 +542,7 @@ impl ModelLoad {
 let tokenizer_source = match self.model_and_tokenizer.to_source() {
     Ok(vs) => vs,
     Err(err) => {
-        if let Some(sp) = sp.take() {
+        if let Some(mut sp) = sp.take() {
             sp.fail(&format!("Failed to load tokenizer: {}", err));
         }
         return Err(err);
@@ -586,7 +595,7 @@
     file_size,
     tensor_count,
 } => {
-    if let Some(sp) = sp.take() {
+    if let Some(mut sp) = sp.take() {
         sp.success(&format!(
             "Loaded {tensor_count} tensors ({}) after {}ms",
             bytesize::to_string(file_size, false),
@@ -601,7 +610,7 @@
 if model.is_err() {
     // If we've failed at loading the model, we probably haven't stopped the spinner yet.
     // Cancel it now if needed.
-    if let Some(sp) = sp {
+    if let Some(mut sp) = sp {
         sp.fail("Failed to load model")
     }
 }
2 changes: 1 addition & 1 deletion binaries/llm-cli/src/interactive.rs
@@ -141,7 +141,7 @@ fn feed_prompt_with_spinner(
         prompt.insert(0, '\n');
     }
 
-    let sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None);
+    let mut sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None);
     let result = session.feed_prompt(
         model,
         &prompt,
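The `mut` bindings added here and in `cli_args.rs` follow from the `spinoff` bump from 0.7.0 to 0.8.0 in `Cargo.toml`: as the diff shows, `Spinner` methods such as `success` and `fail` now require a mutable handle. A minimal sketch of the 0.8-style usage, assuming only the constructor and methods visible in this diff:

```rust
fn main() {
    // spinoff 0.8: the spinner handle must be mutable to update or stop it.
    let mut sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "Working...".to_string(), None);
    std::thread::sleep(std::time::Duration::from_secs(1)); // stand-in for real work
    sp.success("Done"); // takes &mut self in 0.8, hence the new `mut` bindings
}
```
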
10 changes: 5 additions & 5 deletions binaries/llm-test/src/inference.rs
@@ -92,14 +92,14 @@ fn run_inference(
 // Takes the most likely element from the logits, except if they've appeared in `previous_tokens`
 // at all
 #[derive(Debug, Default)]
-struct DeterministicSampler(SampleGreedy<TokenId>);
+struct DeterministicSampler(SampleGreedy);
 
-impl Sampler<TokenId, f32> for DeterministicSampler {
+impl Sampler for DeterministicSampler {
     fn sample<'a>(
         &mut self,
-        res: &mut dyn HasSamplerResources<TokenId = TokenId>,
-        logits: &'a mut Logits<TokenId, f32>,
-    ) -> anyhow::Result<&'a mut Logits<TokenId, f32>> {
+        res: &mut dyn HasSamplerResources,
+        logits: &'a mut Logits,
+    ) -> anyhow::Result<&'a mut Logits> {
         let mut flat_bias = Default::default();
 
         // This might look a little weird, but it's necessary because the resource
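With llm-samplers 0.0.7, `Sampler`, `Logits`, and `HasSamplerResources` lose their `TokenId`/`f32` type parameters, which is all this hunk tracks. The behaviour the test sampler implements is a greedy argmax over the logits that excludes every previously generated token (apparently via a flat bias, per the truncated hunk); it reduces to a standalone sketch like this (hypothetical helper, not the crate's API):

```rust
/// Pick the highest-logit token id that has not already appeared in
/// `previous_tokens`. A sketch of DeterministicSampler's intent over plain
/// slices; the real test wraps llm-samplers' SampleGreedy plus a flat bias.
fn deterministic_sample(logits: &[f32], previous_tokens: &[u32]) -> Option<u32> {
    logits
        .iter()
        .enumerate()
        .filter(|(id, _)| !previous_tokens.contains(&(*id as u32)))
        .max_by(|(_, a), (_, b)| a.total_cmp(b))
        .map(|(id, _)| id as u32)
}
```

With logits `[1.0, 3.0, 2.0]` and `previous_tokens = [1]`, for instance, this returns `Some(2)` even though token 1 has the highest logit.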
(Diffs for the remaining 10 changed files not shown.)
