From c8d2efd3a13a93bac7cc01c38c4ab7b37d5bae2d Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 19 Sep 2024 12:09:37 -0700 Subject: [PATCH 1/7] WIP: Polymorphic span replacement --- src/shard.rs | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/shard.rs b/src/shard.rs index 526f4f4f..2120f0c0 100644 --- a/src/shard.rs +++ b/src/shard.rs @@ -467,7 +467,7 @@ impl Shard { } pub mod shard_config { - use crate::filters::Selector; + use crate::filters::{JqSelector, Selector}; use jsonpath_rust::JsonPathFinder; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -524,12 +524,18 @@ pub mod shard_config { pub syntax: Option, } + #[derive(Serialize, Deserialize, Clone)] + pub struct ReplacementConfig { + pub value: String, + pub selector: bool + } + #[derive(Serialize, Deserialize, Clone)] pub struct SpanReplacementConfig { pub span: String, pub min_score: Option, pub max_score: Option, - pub replacement: String, + pub replacement: ReplacementConfig, pub syntax: Option, } @@ -543,7 +549,33 @@ pub mod shard_config { selector: Selector, min_score: f64, max_score: f64, - replacement: String, + replacement: Replacement, + } + + pub enum Replacement { + Selectors(JqSelector), + String(String), + } + + impl Replacement { + pub fn new(config: &ReplacementConfig) -> Result { + if config.selector { + let selector = JqSelector::new(&config.value)?; + Ok(Replacement::Selectors(selector)) + } else { + Ok(Replacement::String(config.value.clone())) + } + } + + pub fn get(&self, json: &Value) -> Result { + match self { + Replacement::Selectors(selector) => { + let value = selector.select(json)?; + Ok(value.to_string()) + } + Replacement::String(s) => Ok(s.clone()), + } + } } impl SpanReplacer { @@ -553,7 +585,7 @@ pub mod shard_config { selector: Selector::new(&config).unwrap(), min_score: config.min_score.unwrap_or(f64::NEG_INFINITY), max_score: config.max_score.unwrap_or(f64::INFINITY), - replacement: 
config.replacement.clone(), + replacement: Replacement::new(&config.replacement).unwrap(), } } @@ -575,7 +607,7 @@ pub mod shard_config { let replacement = SpanReplacement { start: start as usize, end: end as usize, - replacement: self.replacement.clone(), + replacement: self.replacement.get(json).unwrap(), }; Some(replacement) } else { From b7407f97fb3ecb45de906bef6e3d106e52fea369 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 19 Sep 2024 12:28:43 -0700 Subject: [PATCH 2/7] fmt --- src/shard.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shard.rs b/src/shard.rs index 2120f0c0..34cd80c7 100644 --- a/src/shard.rs +++ b/src/shard.rs @@ -527,7 +527,7 @@ pub mod shard_config { #[derive(Serialize, Deserialize, Clone)] pub struct ReplacementConfig { pub value: String, - pub selector: bool + pub selector: bool, } #[derive(Serialize, Deserialize, Clone)] From c1f583c0bba68a4b0a3dc30ca116d28f85adbb7b Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 19 Sep 2024 15:19:27 -0700 Subject: [PATCH 3/7] Handle escaped chars in replacement --- Makefile | 2 +- pyproject.toml | 2 +- python/dolma/cli/mixer.py | 15 +++++++++++++-- src/shard.rs | 4 ++-- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index d7e2a73a..180b9400 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ setup: $(shell "${PROTOBUF_SETUP}") $(shell "${OPENSSL_SETUP}") which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - which maturin || pip install maturin[patchelf] + which maturin || pip install maturin publish: maturin publish diff --git a/pyproject.toml b/pyproject.toml index 13ff1ff7..cc48c430 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "1.0.12" +version = "1.0.13" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" diff --git a/python/dolma/cli/mixer.py b/python/dolma/cli/mixer.py index 8be8a0d9..ade9d03e 100644 
--- a/python/dolma/cli/mixer.py +++ b/python/dolma/cli/mixer.py @@ -29,6 +29,14 @@ class FilterConfig: ) +@dataclass +class ReplacementConfig: + value: str = field( + default="", help="Value to replace the matched field with, if selector true this should be a jq selector" + ) + selector: bool = field(default=False, help="Whether to use field selector for the replacement") + + @dataclass class SpanReplacementConfig: span: str = field(help="JSONPath expression for the span to replace") @@ -40,7 +48,7 @@ class SpanReplacementConfig: default=None, help="Maximum score for the span to be replaced. Either min_score or max_score must be specified.", ) - replacement: str = field(default="", help="Replacement for the span") + replacement: ReplacementConfig = field(default=ReplacementConfig(), help="Replacement config for the span(s).") syntax: str = field( default="jsonpath", help="Syntax to use for filter expressions. Currently only JSONPath is supported. Defaults to JSONPath.", @@ -131,7 +139,10 @@ def run(cls, parsed_config: MixerConfig): stream_config_dict.setdefault("span_replacement", []).append( { "span": str(span_replacement.span), - "replacement": str(span_replacement.replacement), + "replacement": { + "value": span_replacement.replacement.value, + "selector": span_replacement.replacement.selector, + }, "syntax": span_replacement.syntax, **min_score_config, **max_score_config, diff --git a/src/shard.rs b/src/shard.rs index 34cd80c7..f6f3c865 100644 --- a/src/shard.rs +++ b/src/shard.rs @@ -399,9 +399,9 @@ impl Shard { ); new_text.push_str(&replacement_text); } - data["text"] = Value::String(new_text); + data["text"] = serde_json::from_str(&new_text)?; } - // } + for f in self.discard_fields.iter().flatten() { data.as_object_mut().unwrap().remove(f); } From 443205eb4edfa615bec4c7e4a3cc35ebc72fc375 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 19 Sep 2024 17:19:11 -0700 Subject: [PATCH 4/7] Fixes --- python/dolma/cli/mixer.py | 15 ++------------- src/shard.rs 
| 24 ++++++++++-------------- 2 files changed, 12 insertions(+), 27 deletions(-) diff --git a/python/dolma/cli/mixer.py b/python/dolma/cli/mixer.py index ade9d03e..943d7f74 100644 --- a/python/dolma/cli/mixer.py +++ b/python/dolma/cli/mixer.py @@ -29,14 +29,6 @@ class FilterConfig: ) -@dataclass -class ReplacementConfig: - value: str = field( - default="", help="Value to replace the matched field with, if selector true this should be a jq selector" - ) - selector: bool = field(default=False, help="Whether to use field selector for the replacement") - - @dataclass class SpanReplacementConfig: span: str = field(help="JSONPath expression for the span to replace") @@ -48,7 +40,7 @@ class SpanReplacementConfig: default=None, help="Maximum score for the span to be replaced. Either min_score or max_score must be specified.", ) - replacement: ReplacementConfig = field(default=ReplacementConfig(), help="Replacement config for the span(s).") + replacement: str = field(default="", help="Replacement config for the span(s).") syntax: str = field( default="jsonpath", help="Syntax to use for filter expressions. Currently only JSONPath is supported. 
Defaults to JSONPath.", @@ -139,10 +131,7 @@ def run(cls, parsed_config: MixerConfig): stream_config_dict.setdefault("span_replacement", []).append( { "span": str(span_replacement.span), - "replacement": { - "value": span_replacement.replacement.value, - "selector": span_replacement.replacement.selector, - }, + "replacement": str(span_replacement.replacement), "syntax": span_replacement.syntax, **min_score_config, **max_score_config, diff --git a/src/shard.rs b/src/shard.rs index f6f3c865..d5992caa 100644 --- a/src/shard.rs +++ b/src/shard.rs @@ -399,7 +399,8 @@ impl Shard { ); new_text.push_str(&replacement_text); } - data["text"] = serde_json::from_str(&new_text)?; + + data["text"] = Value::String(new_text); } for f in self.discard_fields.iter().flatten() { @@ -524,18 +525,12 @@ pub mod shard_config { pub syntax: Option, } - #[derive(Serialize, Deserialize, Clone)] - pub struct ReplacementConfig { - pub value: String, - pub selector: bool, - } - #[derive(Serialize, Deserialize, Clone)] pub struct SpanReplacementConfig { pub span: String, pub min_score: Option, pub max_score: Option, - pub replacement: ReplacementConfig, + pub replacement: String, pub syntax: Option, } @@ -558,20 +553,21 @@ pub mod shard_config { } impl Replacement { - pub fn new(config: &ReplacementConfig) -> Result { - if config.selector { - let selector = JqSelector::new(&config.value)?; + pub fn new(string: &str) -> Result { + // Note: Users should escape leading $ in replacement strings + if string.starts_with("$") { + // Strip leading $ and create a selector + let selector = JqSelector::new(&string[1..])?; Ok(Replacement::Selectors(selector)) } else { - Ok(Replacement::String(config.value.clone())) + Ok(Replacement::String(string.to_string())) } } pub fn get(&self, json: &Value) -> Result { match self { Replacement::Selectors(selector) => { - let value = selector.select(json)?; - Ok(value.to_string()) + Ok(serde_json::from_value(selector.select(json)?.to_owned()).unwrap()) } 
Replacement::String(s) => Ok(s.clone()), } From 18ab0df1de28275fc729fad79fb74799d4e2559c Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 20 Sep 2024 10:49:07 -0700 Subject: [PATCH 5/7] Docs and environment cleanup --- Makefile | 26 +------------------------- docs/mixer.md | 2 +- setup.sh | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 26 deletions(-) create mode 100755 setup.sh diff --git a/Makefile b/Makefile index 180b9400..eab71919 100644 --- a/Makefile +++ b/Makefile @@ -1,29 +1,5 @@ -UNAME := $(shell uname) - -ifeq ($(UNAME), Darwin) - OS_MESSAGE := "MacOS detected" - CMAKE_SETUP := "which cmake || brew install cmake" - PROTOBUF_SETUP := "which protoc || brew install protobuf" - OPENSSL_SETUP := "which openssl || brew install openssl" -else ifeq ($(UNAME), Linux) - OS_MESSAGE := "Linux detected" - CMAKE_SETUP := "which cmake || sudo apt-get install --yes build-essential cmake" - PROTOBUF_SETUP := "which protoc || sudo apt-get install --yes protobuf-compiler" - OPENSSL_SETUP := "which openssl || sudo apt-get install --yes libssl-dev" -else - OS_MESSAGE := "Unsupported OS; please install rust, cmake, protobuf, and openssl manually" - CMAKE_SETUP := "" - PROTOBUF_SETUP := "" - OPENSSL_SETUP := "" -endif - setup: - @echo "${OS_MESSAGE}: installing..." - $(shell "${CMAKE_SETUP}") - $(shell "${PROTOBUF_SETUP}") - $(shell "${OPENSSL_SETUP}") - which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - which maturin || pip install maturin + @./setup.sh publish: maturin publish diff --git a/docs/mixer.md b/docs/mixer.md index afa49fd6..6a20add0 100644 --- a/docs/mixer.md +++ b/docs/mixer.md @@ -25,7 +25,7 @@ The following parameters are supported either via CLI (e.g. `dolma mix --paramet |`streams[].span_replacement`|No| A list of objects specifying spans of text to be replaced. | |`streams[].span_replacement[].span`|No| A json-path expression for an attribute that contains an array of spans. 
Each span should be list of length three: `[start, end, score]`. | |`streams[].span_replacement[].min_score`|No| If the span score is less than this value, the span will not be replaced. | -|`streams[].span_replacement[].replacement`|No| The text that should be inserted in place of the span. Use `{}` to represent the original text. | +|`streams[].span_replacement[].replacement`|No| The text that should be inserted in place of the span. Use `{}` to represent the original text. Field selection from the document is also supported by prefixing a jq selector with `$`. Note: Escape a leading `$` if you do not wish to use the jq selector pattern. | |`work_dir.input`|No| Path to a local scratch directory where temporary input files can be placed. If not provided, Dolma will make one for you and delete it upon completion. | |`work_dir.output`|No| Path to a local scratch directory where temporary output files can be placed. If not provided, Dolma will make one for you and delete it upon completion. | |`processes`|No| Number of processes to use for mixing. By default 1 process is used. | diff --git a/setup.sh b/setup.sh new file mode 100755 index 00000000..b71fa385 --- /dev/null +++ b/setup.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e + +UNAME="$(uname)" +PLATFORM="$(uname -m)" + +if [[ $UNAME == "Darwin" ]]; then + echo "MacOS detected..." + which cmake || brew install cmake + which protoc || brew install protobuf + which openssl || brew install openssl +elif [[ $UNAME == "Linux" ]]; then + echo "Linux detected..." + which cmake || sudo apt-get install --yes build-essential cmake + which protoc || sudo apt-get install --yes protobuf-compiler + which openssl || sudo apt-get install --yes libssl-dev +else + echo "Unsupported OS; please install rust, cmake, protobuf, maturin and openssl manually!" + exit 1 +fi + +which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + +if [[ $PLATFORM == "x86_64" ]]; then + echo "x86_64 detected..." 
+ which maturin || pip install maturin[patchelf] +fi + +if [[ $PLATFORM = "aarch64" ]]; then + echo "aarch64 detected..." + which maturin || pip install maturin +fi + +if [[ $PLATFORM = "arm64" ]]; then + echo "arm64 detected..." + which maturin || pip install maturin +else + echo "Unsupported platform; please install maturin manually" + exit 0 +fi From 8806094bdb908ccaa094f40b06fcae2635c17de0 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 20 Sep 2024 10:56:13 -0700 Subject: [PATCH 6/7] Style --- setup.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.sh b/setup.sh index b71fa385..43dbb4ca 100755 --- a/setup.sh +++ b/setup.sh @@ -11,9 +11,9 @@ if [[ $UNAME == "Darwin" ]]; then which openssl || brew install openssl elif [[ $UNAME == "Linux" ]]; then echo "Linux detected..." - which cmake || sudo apt-get install --yes build-essential cmake - which protoc || sudo apt-get install --yes protobuf-compiler - which openssl || sudo apt-get install --yes libssl-dev + which cmake || sudo apt-get install --yes build-essential cmake + which protoc || sudo apt-get install --yes protobuf-compiler + which openssl || sudo apt-get install --yes libssl-dev else echo "Unsupported OS; please install rust, cmake, protobuf, maturin and openssl manually!" exit 1 @@ -23,7 +23,7 @@ which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s if [[ $PLATFORM == "x86_64" ]]; then echo "x86_64 detected..." - which maturin || pip install maturin[patchelf] + which maturin || pip install maturin[patchelf] fi if [[ $PLATFORM = "aarch64" ]]; then @@ -33,7 +33,7 @@ fi if [[ $PLATFORM = "arm64" ]]; then echo "arm64 detected..." 
- which maturin || pip install maturin + which maturin || pip install maturin else echo "Unsupported platform; please install maturin manually" exit 0 From 92ecf832e17c2801beaa42a437b816b1a9a1bff0 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 20 Sep 2024 11:00:33 -0700 Subject: [PATCH 7/7] Check for any arm platform --- setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.sh b/setup.sh index 43dbb4ca..0b4212a7 100755 --- a/setup.sh +++ b/setup.sh @@ -31,8 +31,8 @@ if [[ $PLATFORM = "aarch64" ]]; then which maturin || pip install maturin fi -if [[ $PLATFORM = "arm64" ]]; then - echo "arm64 detected..." +if [[ $PLATFORM = arm* ]]; then + echo "arm detected..." which maturin || pip install maturin else echo "Unsupported platform; please install maturin manually"