diff --git a/datasets/0.8/huggingface-c4/metadata.json b/datasets/0.8/huggingface-c4/metadata.json deleted file mode 100644 index fc7246b47..000000000 --- a/datasets/0.8/huggingface-c4/metadata.json +++ /dev/null @@ -1,238 +0,0 @@ -{ - "@context": { - "@language": "en", - "@vocab": "https://schema.org/", - "column": "ml:column", - "data": { - "@id": "ml:data", - "@type": "@json" - }, - "dataBiases": "ml:dataBiases", - "dataCollection": "ml:dataCollection", - "dataType": { - "@id": "ml:dataType", - "@type": "@vocab" - }, - "dct": "http://purl.org/dc/terms/", - "extract": "ml:extract", - "field": "ml:field", - "fileProperty": "ml:fileProperty", - "format": "ml:format", - "includes": "ml:includes", - "isEnumeration": "ml:isEnumeration", - "jsonPath": "ml:jsonPath", - "ml": "http://mlcommons.org/schema/", - "parentField": "ml:parentField", - "path": "ml:path", - "personalSensitiveInformation": "ml:personalSensitiveInformation", - "recordSet": "ml:recordSet", - "references": "ml:references", - "regex": "ml:regex", - "repeated": "ml:repeated", - "replace": "ml:replace", - "sc": "https://schema.org/", - "separator": "ml:separator", - "source": "ml:source", - "subField": "ml:subField", - "transform": "ml:transform" - }, - "@type": "sc:Dataset", - "name": "c4", - "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", - "citation": "\n@article{2019t5,\n author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},\n title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},\n journal = {arXiv e-prints},\n year = {2019},\n archivePrefix = {arXiv},\n eprint = {1910.10683},\n}\n", - "license": "odc-by", - "url": "https://huggingface.co/datasets/c4", - "version": "0.0.0", - "distribution": [ - { - "@type": "sc:FileObject", - "name": "repo", - "description": "The Hugging Face git repository.", - "contentUrl": "https://huggingface.co/datasets/c4/tree/refs%2Fconvert%2Fparquet", - "encodingFormat": "git+https", - "sha256": "https://github.com/mlcommons/croissant/issues/80" - }, - { - "@type": "sc:FileSet", - "name": "parquet-files", - "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/datasets-server/parquet).", - "containedIn": "repo", - "encodingFormat": "application/x-parquet", - "includes": "*/*/*.parquet" - } - ], - "recordSet": [ - { - "@type": "ml:RecordSet", - "name": "en", - "description": "The en set of records in the dataset.", - "field": [ - { - "@type": "ml:Field", - "name": "text", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "text" - } - } - }, - { - "@type": "ml:Field", - "name": "timestamp", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "timestamp" - } - } - }, - { - "@type": "ml:Field", - "name": "url", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "url" - } - } - } - ] - }, - { - "@type": "ml:RecordSet", - "name": "realnewslike", - "description": "The realnewslike set of records in the dataset.", - "field": [ - { - "@type": "ml:Field", - "name": "text", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "text" - } - } - }, - { - "@type": "ml:Field", - "name": "timestamp", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "timestamp" - } - } - }, - { - "@type": "ml:Field", - "name": "url", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "url" - } - } - } - ] - }, - { - "@type": "ml:RecordSet", - "name": "en.noblocklist", - "description": "The en.noblocklist set of records in the dataset.", - "field": [ - { - "@type": "ml:Field", - "name": "text", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "text" - } - } - }, - { - "@type": "ml:Field", - "name": "timestamp", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "timestamp" - } - } - }, - { - "@type": "ml:Field", - "name": "url", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "url" - } - } - } - ] - }, - { - "@type": "ml:RecordSet", - "name": "en.noclean", - "description": "The en.noclean set of records in the dataset.", - "field": [ - { - "@type": "ml:Field", - "name": "text", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "text" - } - } - }, - { - "@type": "ml:Field", - "name": "timestamp", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "timestamp" - } - } - }, - { - "@type": "ml:Field", - "name": "url", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "distribution": "parquet-files", - "extract": { - "column": "url" - } - } - } - ] - } - ] -} diff --git a/datasets/0.8/huggingface-c4/output/en.jsonl b/datasets/0.8/huggingface-c4/output/en.jsonl deleted file mode 100644 index 17e17db7a..000000000 --- a/datasets/0.8/huggingface-c4/output/en.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"text": "Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.\nHe will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.\nThe cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.", "timestamp": "2019-04-25T12:57:54Z", "url": "https://klyq.com/beginners-bbq-class-taking-place-in-missoula/"} diff --git a/datasets/1.0/huggingface-c4/metadata.json b/datasets/1.0/huggingface-c4/metadata.json index 8cdaf7e61..3769cf199 100644 --- a/datasets/1.0/huggingface-c4/metadata.json +++ b/datasets/1.0/huggingface-c4/metadata.json @@ -50,7 +50,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "citeAs": "\n@article{2019t5,\n author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},\n title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},\n journal = {arXiv e-prints},\n year = {2019},\n archivePrefix = {arXiv},\n eprint = {1910.10683},\n}\n", "license": "odc-by", - "url": "https://huggingface.co/datasets/c4", + "url": "https://huggingface.co/datasets/allenai/c4", "version": "0.0.0", "distribution": [ { @@ -58,7 +58,7 @@ "@id": "repo", "name": "repo", "description": "The Hugging Face git repository.", - "contentUrl": "https://huggingface.co/datasets/c4/tree/refs%2Fconvert%2Fparquet", + "contentUrl": "https://huggingface.co/datasets/allenai/c4/tree/refs%2Fconvert%2Fparquet", "encodingFormat": "git+https", "sha256": "https://github.com/mlcommons/croissant/issues/80" }, @@ -77,172 +77,31 @@ "recordSet": [ { "@type": "cr:RecordSet", - "@id": "en", - "name": "en", + "@id": "data", + "name": "data", "description": "The en set of records in the dataset.", "field": [ { "@type": "cr:Field", - "@id": "en/text", - "name": "text", - "description": "Column from Hugging Face parquet file.", + "@id": "variant", + "name": "variant", + "description": "The name of the variant (e.g., en or en.noclean).", "dataType": "sc:Text", "source": { "fileSet": { "@id": "parquet-files" }, "extract": { - "column": "text" - } - } - }, - { - "@type": "cr:Field", - "@id": "en/timestamp", - "name": "timestamp", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" + "fileProperty": "fullpath" }, - "extract": { - "column": "timestamp" + "transform": { + "regex": "(.+)\/.+\/.+\\.parquet" } } }, { "@type": "cr:Field", - "@id": "en/url", - "name": "url", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" - }, - "extract": { - "column": "url" - } - } - } - ] - }, - { - "@type": "cr:RecordSet", - "@id": "realnewslike", - "name": "realnewslike", - "description": "The realnewslike set of records in the dataset.", - "field": [ - { - "@type": "cr:Field", - "@id": "realnewslike/text", - "name": "text", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" - }, - "extract": { - "column": "text" - } - } - }, - { - "@type": "cr:Field", - "@id": "realnewslike/timestamp", - "name": "timestamp", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" - }, - "extract": { - "column": "timestamp" - } - } - }, - { - "@type": "cr:Field", - "@id": "realnewslike/url", - "name": "url", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" - }, - "extract": { - "column": "url" - } - } - } - ] - }, - { - "@type": "cr:RecordSet", - "@id": "en.noblocklist", - "name": "en.noblocklist", - "description": "The en.noblocklist set of records in the dataset.", - "field": [ - { - "@type": "cr:Field", - "@id": "en.noblocklist/text", - "name": "text", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" - }, - "extract": { - "column": "text" - } - } - }, - { - "@type": "cr:Field", - "@id": "en.noblocklist/timestamp", - "name": "timestamp", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" - }, - "extract": { - "column": "timestamp" - } - } - }, - { - "@type": "cr:Field", - "@id": "en.noblocklist/url", - "name": "url", - "description": "Column from Hugging Face parquet file.", - "dataType": "sc:Text", - "source": { - "fileSet": { - "@id": "parquet-files" - }, - "extract": { - "column": "url" - } - } - } - ] - }, - { - "@type": "cr:RecordSet", - "@id": "en.noclean", - "name": "en.noclean", - "description": "The en.noclean set of records in the dataset.", - "field": [ - { - "@type": "cr:Field", - "@id": "en.noclean/text", + "@id": "text", "name": "text", "description": "Column from Hugging Face parquet file.", "dataType": "sc:Text", @@ -257,7 +116,7 @@ }, { "@type": "cr:Field", - "@id": "en.noclean/timestamp", + "@id": "timestamp", "name": "timestamp", "description": "Column from Hugging Face parquet file.", "dataType": "sc:Text", @@ -272,7 +131,7 @@ }, { "@type": "cr:Field", - "@id": "en.noclean/url", + "@id": "url", "name": "url", "description": "Column from Hugging Face parquet file.", "dataType": "sc:Text", diff --git a/datasets/1.0/huggingface-c4/output/data.jsonl b/datasets/1.0/huggingface-c4/output/data.jsonl new file mode 100644 index 000000000..2fc314177 --- /dev/null +++ b/datasets/1.0/huggingface-c4/output/data.jsonl @@ -0,0 +1 @@ +{"variant": "en", "text": "Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.\nHe will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.\nThe cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.", "timestamp": "2019-04-25 12:57:54", "url": "https://klyq.com/beginners-bbq-class-taking-place-in-missoula/"} diff --git a/datasets/1.0/huggingface-c4/output/en.jsonl b/datasets/1.0/huggingface-c4/output/en.jsonl deleted file mode 100644 index 17e17db7a..000000000 --- a/datasets/1.0/huggingface-c4/output/en.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"text": "Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.\nHe will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.\nThe cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.", "timestamp": "2019-04-25T12:57:54Z", "url": "https://klyq.com/beginners-bbq-class-taking-place-in-missoula/"} diff --git a/python/mlcroissant/mlcroissant/_src/datasets.py b/python/mlcroissant/mlcroissant/_src/datasets.py index 1dcdf88b1..f81b1ffb9 100644 --- a/python/mlcroissant/mlcroissant/_src/datasets.py +++ b/python/mlcroissant/mlcroissant/_src/datasets.py @@ -198,7 +198,7 @@ def _find_data_field_to_filter( for operation in operations: if isinstance(operation, ReadFields): for field in operation.node.fields: - if field.id in filters: + if field.uuid in filters: return field, filters[field.id] raise ValueError( f"Filters ({filters}) do not apply to the fields. `filters` must be a" @@ -208,13 +208,21 @@ def _find_data_field_to_filter( def _regex_to_glob(regex: str) -> str: - """Converts a regular expression to a blob pattern by unescaping regex syntax.""" + """Converts a regular expression to a blob pattern by unescaping regex syntax. + + Warning: this is based on manual heuristics to convert a regular expression to a + glob expression. + """ # Remove starting ^ regex = re.sub(r"^\^", "", regex) # Remove trailing $ regex = re.sub(r"\$$", "", regex) # Interpret \. as . regex = re.sub(r"\\\.", ".", regex) + # Interpret .* as * + regex = re.sub(r"\.\*", "*", regex) + # Interpret .+ as * + regex = re.sub(r"\.\+", "*", regex) return regex @@ -268,7 +276,7 @@ def _propagate_includes(field: Field, operations: nx.Graph[Operation], new_regex for operation in operations: if isinstance(operation, FilterFiles): node = operation.node - if node.id == source_uuid and new_regex: + if node.uuid == source_uuid and new_regex: includes = node.includes or [] if source_type == FileProperty.filename: new_includes = [] @@ -281,13 +289,13 @@ def _propagate_includes(field: Field, operations: nx.Graph[Operation], new_regex new_includes.append("/".join(new_pattern)) node.includes = new_includes elif source_type == FileProperty.fullpath: - node.includes = [_regex_to_glob(pattern) for pattern in includes] + node.includes = [_regex_to_glob(new_regex) for _ in includes] else: raise NotImplementedError(error) def _validate_filters(filters: Filters): - if isinstance(filters, Mapping): + if isinstance(filters, Mapping) and len(filters) <= 1: if all(isinstance(value, str) for value in filters.values()): return raise ValueError( diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py index fdd51bdbc..6f2f3ea43 100644 --- a/python/mlcroissant/mlcroissant/_src/datasets_test.py +++ b/python/mlcroissant/mlcroissant/_src/datasets_test.py @@ -180,7 +180,6 @@ def test_hermetic_loading_1_0(dataset_name, record_set_name, num_records, filter 10, ], ["gpt-3/metadata.json", "default", 10], - ["huggingface-c4/metadata.json", "en", 1], ["huggingface-mnist/metadata.json", "default", 10], ["titanic/metadata.json", "passengers", -1], ], @@ -192,13 +191,16 @@ def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records # Non-hermetic test cases for croissant >=1.0 only (data from the internet). @pytest.mark.nonhermetic @pytest.mark.parametrize( - ["dataset_name", "record_set_name", "num_records"], + ["dataset_name", "record_set_name", "num_records", "filters"], [ - ["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10], + ["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None], + ["huggingface-c4/metadata.json", "data", 1, {"variant": "en"}], ], ) -def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records): - load_records_and_test_equality("1.0", dataset_name, record_set_name, num_records) +def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters): + load_records_and_test_equality( + "1.0", dataset_name, record_set_name, num_records, filters + ) @pytest.mark.nonhermetic @@ -258,6 +260,7 @@ def test_cypress_fixtures(version): [ [{}, False], [{"split": "test"}, False], + [{"split": "test", "other_filter": "foo"}, True], [{"split": ["train", "test"]}, True], [{"split": 1}, True], ],