Lightning-AI · carmocca · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024
@@ -2,7 +2,7 @@
 
 from lit_gpt.data.base import LitDataModule, SFTDataset, apply_prompt_template, get_sft_collate_fn
 from lit_gpt.data.alpaca import Alpaca
-from lit_gpt.data.csv import CSV
+from lit_gpt.data.json import JSON
 from lit_gpt.data.dolly import Dolly
 from lit_gpt.data.flan import FLAN
 from lit_gpt.data.lima import LIMA

@@ -2,7 +2,7 @@
 
 import json
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 from torch.utils.data import random_split, DataLoader
@@ -11,25 +11,34 @@
 from lit_gpt.tokenizer import Tokenizer
 
 
-class CSV(LitDataModule):
-    """Loads CSV data for supervised finetuning.
+class JSON(LitDataModule):
+    """Loads JSON data for supervised finetuning.
 
     Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels".
+
+    Args:
+        json_path: A path to a JSON file containing the data. The file should contain a list of samples (dicts).
+            Each dict must have the keys 'instruction' and 'output', and can optionally have a key 'input'
+            (see Alpaca).
+        mask_prompt: Whether to mask the prompt section from the label (with ``ignore_index``).
+        test_split_fraction: A number in the range [0, 1] that determines the fraction of the dataset
+            to use for testing.
+        ignore_index: The index to use for elements to be ignored in the label.
+        seed: The random seed for creating the train/val splits and shuffling the dataset.
+        num_workers: How many DataLoader processes to use for loading.
     """
 
     def __init__(
         self,
-        csv_path: Path,
-        columns: Tuple[str, ...] = ("instruction", "input", "output"),
+        json_path: Path,
         mask_prompt: bool = False,
         test_split_fraction: float = 0.1,
         ignore_index: int = -1,
         seed: int = 42,
         num_workers: int = 4,
     ) -> None:
         super().__init__()
-        self.csv_path = csv_path
-        self.columns = columns
+        self.json_path = json_path
         self.mask_prompt = mask_prompt
         self.test_split_fraction = test_split_fraction
         self.ignore_index = ignore_index
@@ -42,6 +51,9 @@ def __init__(
         self.train_dataset: Optional[SFTDataset] = None
         self.test_dataset: Optional[SFTDataset] = None
 
+        if not self.json_path.is_file():
+            raise FileNotFoundError(f"The file {self.json_path} does not exist.")
+
     def connect(
         self,
         tokenizer: Optional[Tokenizer] = None,
@@ -53,12 +65,8 @@ def connect(
         self.max_seq_length = -1 if max_seq_length is None else max_seq_length
 
     def setup(self, stage: str = "") -> None:
-        import pandas as pd
-
-        df = pd.read_csv(self.csv_path, dtype=str).fillna("")
-        if not (df.columns.values == self.columns).all():
-            raise ValueError(f"CSV columns must be {self.columns}, found {df.columns.values}")
-        data = json.loads(df.to_json(orient="records", indent=4))
+        with open(self.json_path, "r", encoding="utf-8") as file:
+            data = json.load(file)
 
         # Partition the dataset into train and test
         train_data, test_data = random_split(

@@ -1,33 +1,38 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
-
+import json
 from unittest import mock
 
+import pytest
+
+
+@mock.patch("lit_gpt.data.json.prompt_template", "X: {instruction} {input} Y:")
+def test_json(tmp_path, mock_tockenizer):
+    from lit_gpt.data import JSON
+
+    json_path = tmp_path / "data.json"
+    mock_data = [
+        {"instruction": "Add", "input": "2+2", "output": "4"},
+        {"instruction": "Subtract", "input": "5-3", "output": "2"},
+        {"instruction": "Multiply", "input": "6*4", "output": "24"},
+        {"instruction": "Divide", "input": "10/2", "output": "5"},
+        {"instruction": "Exponentiate", "input": "2^3", "output": "8"},
+        {"instruction": "Square root", "input": "√9", "output": "3"},
+    ]
 
-@mock.patch("lit_gpt.data.csv.prompt_template", "X: {instruction} {input} Y:")
-def test_csv(tmp_path, mock_tockenizer):
-    from lit_gpt.data import CSV
+    with open(json_path, "w", encoding="utf-8") as fp:
+        json.dump(mock_data, fp)
 
-    csv_path = tmp_path / "data.csv"
-    mock_data = (
-        "instruction,input,output\n"
-        "Add,2+2,4\n"
-        "Subtract,5-3,2\n"
-        "Multiply,6*4,24\n"
-        "Divide,10/2,5\n"
-        "Exponentiate,2^3,8\n"
-        "Square root,√9,3\n"
-    )
-    with open(csv_path, "w", encoding="utf-8") as fp:
-        fp.write(mock_data)
+    with pytest.raises(FileNotFoundError):
+        JSON(tmp_path / "not exist")
 
     # TODO: Make prompt template an argument
-    csv = CSV(csv_path, test_split_fraction=0.5, num_workers=0)
-    csv.connect(tokenizer=mock_tockenizer, batch_size=2)
-    csv.prepare_data()  # does nothing
-    csv.setup()
+    data = JSON(json_path, test_split_fraction=0.5, num_workers=0)
+    data.connect(tokenizer=mock_tockenizer, batch_size=2)
+    data.prepare_data()  # does nothing
+    data.setup()
 
-    train_dataloader = csv.train_dataloader()
-    val_dataloader = csv.val_dataloader()
+    train_dataloader = data.train_dataloader()
+    val_dataloader = data.val_dataloader()
 
     assert len(train_dataloader) == 2
     assert len(val_dataloader) == 2