diff --git a/apps/python/register/README.md b/apps/python/register/README.md index 784e42b..28dd0c3 100644 --- a/apps/python/register/README.md +++ b/apps/python/register/README.md @@ -64,7 +64,7 @@ python3 apps/python/register/register.py \ # Dockers -* (Optional): Prepare a postgreSQL from `apps/python/postgresql` running `docker compose up` +* (Optional): Prepare a postgreSQL from `infrastructure/dev/postgresql` running `docker compose up postgresql pgadmin` 1. Build register `./build.sh` @@ -76,11 +76,26 @@ docker run -it --network host pocket_dataset_register \ /code/register.py \ --tasks arc_challenge,hellaswag,truthfulqa_mc2,mmlu_abstract_algebra,mmlu_anatomy,mmlu_astronomy,mmlu_business_ethics,mmlu_clinical_knowledge,mmlu_college_biology,mmlu_college_chemistry,mmlu_college_computer_science,mmlu_college_mathematics,mmlu_college_medicine,mmlu_college_physics,mmlu_computer_security,mmlu_conceptual_physics,mmlu_econometrics,mmlu_electrical_engineering,mmlu_elementary_mathematics,mmlu_formal_logic,mmlu_global_facts,mmlu_high_school_biology,mmlu_high_school_chemistry,mmlu_high_school_computer_science,mmlu_high_school_european_history,mmlu_high_school_geography,mmlu_high_school_government_and_politics,mmlu_high_school_macroeconomics,mmlu_high_school_mathematics,mmlu_high_school_microeconomics,mmlu_high_school_physics,mmlu_high_school_psychology,mmlu_high_school_statistics,mmlu_high_school_us_history,mmlu_high_school_world_history,mmlu_human_aging,mmlu_human_sexuality,mmlu_international_law,mmlu_jurisprudence,mmlu_logical_fallacies,mmlu_machine_learning,mmlu_management,mmlu_marketing,mmlu_medical_genetics,mmlu_miscellaneous,mmlu_moral_disputes,mmlu_moral_scenarios,mmlu_nutrition,mmlu_philosophy,mmlu_prehistory,mmlu_professional_accounting,mmlu_professional_law,mmlu_professional_medicine,mmlu_professional_psychology,mmlu_public_relations,mmlu_security_studies,mmlu_sociology,mmlu_us_foreign_policy,mmlu_virology,mmlu_world_religions,winogrande,gsm8k \ --dbname 
lm-evaluation-harness \ ---user root \ ---password root \ +--user admin \ +--password admin \ --host localhost \ --port 5432 \ --verbosity DEBUG ``` **Note**:If you have already downloaded HF datasets, mount them adding `-v path/to/huggingface/directory:/root/.cache/huggingface/` to avoid re-download. + + +### Accessing the DB with PG Admin + +To explore the generated database, the PG Admin is available in the docker compose (`infrastructure/dev/docker-compose.yaml`). +To access the service just go to `127.0.0.1:5050` and use the credentials `admin@local.dev:admin`. +Then in the PG Admin page click on `Add New Server` and fill the data: +General tab: +- Name: `pokt-ml-datasets` +Connection tab: +- Host Name: `postgres_container` +- Port: `5432` +- Maintenance database: `lm-evaluation-harness` +- Username: `admin` +- Password: `admin` \ No newline at end of file diff --git a/apps/python/register/register.py b/apps/python/register/register.py index 12ed343..a344de6 100644 --- a/apps/python/register/register.py +++ b/apps/python/register/register.py @@ -1,197 +1,8 @@ -################################ -# lm-eval-harness (evaulator.py) -################################ -import argparse -import json -import logging -import os -import re -import sys -from functools import partial -from pathlib import Path -from typing import Union +import utils.lmeh as lmeh -import numpy as np - -from lm_eval import evaluator, utils -from lm_eval.evaluator import request_caching_arg_to_dict -from lm_eval.logging_utils import WandbLogger -from lm_eval.tasks import TaskManager, include_path, initialize_tasks -from lm_eval.utils import make_table, simple_parse_args_string - -################################ -# Custom modules -################################ -from utils.uploader import get_ConfigurableTask -from utils.sql import create_dataset_table, register_task, create_task_table, checked_task -import psycopg2 -from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT - -def 
parse_eval_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument( - "--tasks", - "-t", - default=None, - metavar="task1,task2", - help="To get full list of tasks, use the command lm-eval --tasks list", - ) - parser.add_argument( - "--dbname", - type=str, - default="postgres", - help="Name of the database", - ) - parser.add_argument( - "--user", - type=str, - default="postgres", - help="Name of the user", - ) - parser.add_argument( - "--password", - type=str, - default="password", - help="Password for the user", - ) - parser.add_argument( - "--host", - type=str, - default="localhost", - help="Host name", - ) - parser.add_argument( - "--port", - type=str, - default="5432", - help="Port number", - ) - parser.add_argument( - "--include_path", - type=str, - default=None, - metavar="DIR", - help="Additional path to include if there are external tasks to include.", - ) - parser.add_argument( - "--verbosity", - "-v", - type=str.upper, - default="INFO", - metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG", - help="Controls the reported logging error level. 
Set to DEBUG when testing + adding new task configurations for comprehensive log output.", - ) - return parser.parse_args() - -def cli_register_task(args: Union[argparse.Namespace, None] = None) -> None: - if not args: - # we allow for args to be passed externally, else we parse them ourselves - args = parse_eval_args() - - eval_logger = utils.eval_logger - eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) - eval_logger.info(f"Verbosity set to {args.verbosity}") - - initialize_tasks(args.verbosity) - task_manager = TaskManager(args.verbosity, include_path=args.include_path) - - if args.include_path is not None: - eval_logger.info(f"Including path: {args.include_path}") - include_path(args.include_path) - - if args.tasks is None: - eval_logger.error("Need to specify task to evaluate.") - sys.exit() - elif args.tasks == "list": - eval_logger.info( - "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)) - ) - sys.exit() - else: - if os.path.isdir(args.tasks): - import glob - - task_names = [] - yaml_path = os.path.join(args.tasks, "*.yaml") - for yaml_file in glob.glob(yaml_path): - config = utils.load_yaml_config(yaml_file) - task_names.append(config) - else: - task_list = args.tasks.split(",") - task_names = task_manager.match_tasks(task_list) - for task in [task for task in task_list if task not in task_names]: - if os.path.isfile(task): - config = utils.load_yaml_config(task) - task_names.append(config) - task_missing = [ - task for task in task_list if task not in task_names and "*" not in task - ] # we don't want errors if a wildcard ("*") task name was used - - if task_missing: - missing = ", ".join(task_missing) - eval_logger.error( - f"Tasks were not found: {missing}\n" - f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks", - ) - raise ValueError( - f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues." 
- ) - - task_dict = get_ConfigurableTask( - tasks=task_names, - num_fewshot=None, - check_integrity=False, - gen_kwargs=None, - task_manager= None, - verbosity= "INFO", - predict_only= False, - ) - - # check and connect to the database - try: - conn = psycopg2.connect( - dbname=args.dbname, - user=args.user, - password=args.password, - host=args.host, - port=args.port - ) - eval_logger.info("Connected to the database") - # Obtain a DB Cursor - cursor = conn.cursor() - except Exception as e: - eval_logger.error("Unable to connect to the database") - exit(-1) - - create_task_table(connection=conn) +if __name__ == "__main__": - for t in task_dict: - task_name_i = t - dataset_path = task_dict[t].config.dataset_path - dataset_name = task_dict[t].config.dataset_name - table_name = dataset_path + "--" + dataset_name if dataset_name else dataset_path - data = task_dict[t].dataset - # check if the task is already registered - if not checked_task(task_name_i, connection= conn): - # Register task - try: - # Create dataset table - create_dataset_table(table_name = table_name, - data = data, - connection = conn) - # Regist task/dataset pair - register_task(task_name = task_name_i, - dataset_table_name = table_name, - connection = conn) - except Exception as e: - eval_logger.error(f"Error: {e}") - conn.rollback() - cursor.close() - conn.close() - exit(-1) - eval_logger.info(f"Task {task_name_i} registered successfully") - else: - eval_logger.info(f"Task {task_name_i} already registered") + # This is the task to upload datasets -if __name__ == "__main__": - cli_register_task() \ No newline at end of file + # TODO: read configuration from file and pass it to the function (mutiple tests support requirement) + lmeh.cli_register_task() \ No newline at end of file diff --git a/apps/python/register/requirements.txt b/apps/python/register/requirements.txt index c16fa3d..6b1cf98 100644 --- a/apps/python/register/requirements.txt +++ b/apps/python/register/requirements.txt @@ -1,2 +1,2 @@ 
-lm_eval[dev] @ git+https://github.com/EleutherAI/lm-evaluation-harness@7d9922c80114218eaf43975b7655bb48cda84f50 +lm_eval[dev]==0.4.2 psycopg2==2.9.9 \ No newline at end of file diff --git a/apps/python/register/utils/lmeh.py b/apps/python/register/utils/lmeh.py new file mode 100644 index 0000000..0db7d51 --- /dev/null +++ b/apps/python/register/utils/lmeh.py @@ -0,0 +1,291 @@ +import argparse +import logging +import os +import sys +from typing import Union +from lm_eval import utils +from lm_eval.tasks import TaskManager +from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string +from lm_eval.tasks import TaskManager, get_task_dict +from lm_eval.evaluator_utils import run_task_tests +from lm_eval.__main__ import parse_eval_args +from typing import List, Optional, Union + + +from utils.sql import create_dataset_table, register_task, create_task_table, checked_task +import psycopg2 + + +def setup_parser() -> argparse.Namespace: + ''' + Argument parsing for LM-Evaluation-Harness dataset uploading. 
+ ''' + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + "--tasks", + "-t", + default=None, + type=str, + metavar="task1,task2", + help="To get full list of tasks, use the command lm-eval --tasks list", + ) + parser.add_argument( + "--dbname", + type=str, + default="lm-evaluation-harness", + help="Name of the database", + ) + parser.add_argument( + "--user", + type=str, + default="admin", + help="Name of the user", + ) + parser.add_argument( + "--password", + type=str, + default="admin", + help="Password for the user", + ) + parser.add_argument( + "--host", + type=str, + default="localhost", + help="Host name", + ) + parser.add_argument( + "--port", + type=str, + default="5432", + help="Port number", + ) + parser.add_argument( + "--include_path", + type=str, + default=None, + metavar="DIR", + help="Additional path to include if there are external tasks to include.", + ) + parser.add_argument( + "--verbosity", + "-v", + type=str.upper, + default="INFO", + metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG", + help="Controls the reported logging error level. 
+ ''' + if not args: + # we allow for args to be passed externally, else we parse them ourselves + parser = setup_parser() + args = parse_eval_args(parser) + + eval_logger = utils.eval_logger + eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) + eval_logger.info(f"Verbosity set to {args.verbosity}") + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + task_manager = TaskManager(args.verbosity, include_path=args.include_path) + + if args.tasks is None: + eval_logger.error("Need to specify task to evaluate.") + sys.exit() + elif args.tasks == "list": + eval_logger.info( + "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)) + ) + sys.exit() + else: + if os.path.isdir(args.tasks): + import glob + + task_names = [] + yaml_path = os.path.join(args.tasks, "*.yaml") + for yaml_file in glob.glob(yaml_path): + config = utils.load_yaml_config(yaml_file) + task_names.append(config) + else: + task_list = args.tasks.split(",") + task_names = task_manager.match_tasks(task_list) + for task in [task for task in task_list if task not in task_names]: + if os.path.isfile(task): + config = utils.load_yaml_config(task) + task_names.append(config) + task_missing = [ + task for task in task_list if task not in task_names and "*" not in task + ] # we don't want errors if a wildcard ("*") task name was used + + if task_missing: + missing = ", ".join(task_missing) + eval_logger.error( + f"Tasks were not found: {missing}\n" + f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks", + ) + raise ValueError( + f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues." 
+ ) + + task_dict = get_ConfigurableTask( + tasks=task_names, + num_fewshot=None, + check_integrity=False, + gen_kwargs=None, + task_manager= None, + verbosity= "INFO", + predict_only= False, + ) + + # check and connect to the database + try: + conn = psycopg2.connect( + dbname=args.dbname, + user=args.user, + password=args.password, + host=args.host, + port=args.port + ) + eval_logger.info("Connected to the database") + # Obtain a DB Cursor + cursor = conn.cursor() + except Exception as e: + eval_logger.error("Unable to connect to the database") + exit(-1) + + create_task_table(connection=conn) + + for t in task_dict: + task_name_i = t + dataset_path = task_dict[t].config.dataset_path + dataset_name = task_dict[t].config.dataset_name + table_name = dataset_path + "--" + dataset_name if dataset_name else dataset_path + data = task_dict[t].dataset + # check if the task is already registered + if not checked_task(task_name_i, connection= conn): + # Register task + try: + # Create dataset table + create_dataset_table(table_name = table_name, + data = data, + connection = conn) + # Regist task/dataset pair + register_task(task_name = task_name_i, + dataset_table_name = table_name, + connection = conn) + except Exception as e: + eval_logger.error(f"Error: {e}") + conn.rollback() + cursor.close() + conn.close() + exit(-1) + eval_logger.info(f"Task {task_name_i} registered successfully") + else: + eval_logger.info(f"Task {task_name_i} already registered") + + +# cutted def simple_evaluate(..) from lm-eval-harness to generate config task +@positional_deprecated +def get_ConfigurableTask( + tasks: Optional[List[Union[str, dict, object]]] = None, + num_fewshot: Optional[int] = None, + check_integrity: bool = False, + gen_kwargs: Optional[str] = None, + task_manager: Optional[TaskManager] = None, + verbosity: str = "INFO", + predict_only: bool = False, + +): + """Instantiate and evaluate a model on a list of tasks. 
+ + :param tasks: list[Union[str, dict, Task]] + List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. + :param num_fewshot: int + Number of examples in few-shot context + :param check_integrity: bool + Whether to run the relevant part of the test suite for the tasks + :param gen_kwargs: str + String arguments for model generation + Ignored for all tasks with loglikelihood output_type + :param predict_only: bool + If true only model outputs will be generated and returned. Metrics will not be evaluated + + :return + Task dictionary + """ + eval_logger.setLevel(getattr(logging, f"{verbosity}")) + + seed_message = [] + + if seed_message: + eval_logger.info(" | ".join(seed_message)) + + if tasks is None: + tasks = [] + if len(tasks) == 0: + raise ValueError( + "No tasks specified, or no tasks found. Please verify the task names." + ) + + if gen_kwargs is not None: + gen_kwargs = simple_parse_args_string(gen_kwargs) + eval_logger.warning( + "generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. " + "Ensure 'do_sample=True' for non-greedy decoding!" + ) + if gen_kwargs == "": + gen_kwargs = None + + if task_manager is None: + task_manager = TaskManager(verbosity) + + task_dict = get_task_dict(tasks, task_manager) + for task_name in task_dict.keys(): + task_obj = task_dict[task_name] + if isinstance(task_obj, tuple): + _, task_obj = task_obj + if task_obj is None: + continue + + if task_obj.get_config("output_type") == "generate_until": + if gen_kwargs is not None: + task_obj.set_config( + key="generation_kwargs", value=gen_kwargs, update=True + ) + + if predict_only: + log_samples = True + eval_logger.info( + f"Processing {task_name} in output-only mode. Metrics will not be calculated!" + ) + # we have to change the class properties post-hoc. This is pretty hacky. 
+ task_obj.override_metric(metric_name="bypass") + + # override tasks' fewshot values to the provided num_fewshot arg value + # except if tasks have it set to 0 manually in their configs--then we should never overwrite that + if num_fewshot is not None: + if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: + eval_logger.info( + f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored." + ) + else: + eval_logger.warning( + f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" + ) + task_obj.set_config(key="num_fewshot", value=num_fewshot) + else: + # if num_fewshot not provided, and the task does not define a default one, default to 0 + if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None: + task_obj.set_config(key="num_fewshot", value=0) + + if check_integrity: + run_task_tests(task_list=tasks) + + return task_dict diff --git a/apps/python/register/utils/sql.py b/apps/python/register/utils/sql.py index 539f511..96bb4f6 100644 --- a/apps/python/register/utils/sql.py +++ b/apps/python/register/utils/sql.py @@ -5,7 +5,6 @@ import decimal import datasets -from datasets import DatasetDict # Define the columns for the table _ID_NAME = "__id" diff --git a/apps/python/sampler/Dockerfile b/apps/python/sampler/Dockerfile new file mode 100644 index 0000000..55f4c01 --- /dev/null +++ b/apps/python/sampler/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.11.8-slim-bookworm + +RUN apt-get update -y && \ + apt-get install -y libpq-dev gcc git + +RUN mkdir -p /code +COPY ./requirements_dev.txt /code/requirements_dev.txt +RUN pip install --no-cache-dir -r /code/requirements_dev.txt + + + +# Mirror folder without unnecesary files +COPY ./sampler.py /code/sampler.py +COPY ./utils /code/utils + +ENTRYPOINT ["python3"] \ No newline at end of file diff --git a/apps/python/sampler/README.md b/apps/python/sampler/README.md index 248a27d..b1867f7 100644 --- 
a/apps/python/sampler/README.md +++ b/apps/python/sampler/README.md @@ -1,3 +1,31 @@ +# pocket_lm_eval +Files that follow the structure of `lm-eval-harness`. The intention is to avoid maintaining a fork. + +## pocket_lm_eval - task +* **[New]** `PocketNetworkTaskManager`: A class based on `TaskManager`, that is used to inject `pocket_args` into the `task.config.metadata`. + +## pocket_lm_eval - api +* **[New]** `PocketNetworkConfigurableTask`: A class based on `ConfigurableTask`, that retrieves samples from the sql database, based on `blacklist` id's & `uri` previously defined in `pocket_args`. In `PocketNetworkConfigurableTask.download` validations regarding `training_split`, `validation_split`, `test_split` and `fewshot_split` are followed as pointed out in the `lm-eval-harness` documentation. + +# generator + +* **[New]** A function `get_ConfigurableTask` to return only the random samples based on the configuration split and the blacklist. + +# Docker Run + +```bash +docker run -it --network host pocket_sampler \ +/code/sampler.py \ +--pocket_args '{"hellaswag": {"address": "random", "blacklist": [49908, 59949], "qty": 15}}' \ +--dbname lm-evaluation-harness \ +--user admin \ +--password admin \ +--host localhost \ +--port 5432 \ +--verbosity DEBUG +``` + + ### Create a copy of config.json and set the proper values for your environment ### Run CONFIG_PATH=/your/path/to/config.json python worker/main.py diff --git a/apps/python/sampler/build.sh b/apps/python/sampler/build.sh new file mode 100755 index 0000000..1a4cf6a --- /dev/null +++ b/apps/python/sampler/build.sh @@ -0,0 +1,6 @@ +DEFAULT_IMAGE_NAME="pocket_sampler" + +# Build sampler +docker build . 
--progress=plain --tag $DEFAULT_IMAGE_NAME:latest +# Broadcast image name and tag +echo "$DEFAULT_IMAGE_NAME" \ No newline at end of file diff --git a/apps/python/sampler/requirements_dev.txt b/apps/python/sampler/requirements_dev.txt new file mode 100644 index 0000000..711b3dc --- /dev/null +++ b/apps/python/sampler/requirements_dev.txt @@ -0,0 +1,3 @@ +lm_eval[dev]==0.4.2 +psycopg2==2.9.9 +SQLAlchemy==2.0.29 \ No newline at end of file diff --git a/apps/python/sampler/sampler.py b/apps/python/sampler/sampler.py new file mode 100644 index 0000000..fbc529e --- /dev/null +++ b/apps/python/sampler/sampler.py @@ -0,0 +1,6 @@ +import utils.lmeh as lmeh + +if __name__ == "__main__": + + # This is the task to generate samples request + lmeh.cli_register_task() \ No newline at end of file diff --git a/apps/python/sampler/utils/__init__.py b/apps/python/sampler/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/python/register/utils/uploader.py b/apps/python/sampler/utils/generator.py similarity index 80% rename from apps/python/register/utils/uploader.py rename to apps/python/sampler/utils/generator.py index 689501c..e4ab321 100644 --- a/apps/python/register/utils/uploader.py +++ b/apps/python/sampler/utils/generator.py @@ -1,8 +1,9 @@ import itertools import logging import random +import time from collections import defaultdict -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union import numpy as np import torch @@ -21,6 +22,7 @@ ) from lm_eval.logging_utils import add_env_info, get_git_commit_hash from lm_eval.tasks import TaskManager, get_task_dict +from utils.pocket_lm_eval.tasks import PocketNetworkTaskManager from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string @@ -29,7 +31,7 @@ from lm_eval.tasks import Task -# cutted def simple_evaluate(..) 
from lm-eval-harness to generate config task commit:7d9922c80114218eaf43975b7655bb48cda84f50 +# adapted from evaluator.py # def simple_evaluate(..) from lm-eval-harness to generate config task @positional_deprecated def get_ConfigurableTask( tasks: Optional[List[Union[str, dict, object]]] = None, @@ -39,6 +41,7 @@ def get_ConfigurableTask( task_manager: Optional[TaskManager] = None, verbosity: str = "INFO", predict_only: bool = False, + pocket_args: Optional[Dict] = None, ): """Instantiate and evaluate a model on a list of tasks. @@ -59,6 +62,7 @@ def get_ConfigurableTask( Task dictionary """ eval_logger.setLevel(getattr(logging, f"{verbosity}")) + start_date = time.time() seed_message = [] @@ -82,12 +86,8 @@ def get_ConfigurableTask( gen_kwargs = None if task_manager is None: - task_manager = TaskManager(verbosity) + task_manager = PocketNetworkTaskManager(verbosity, pocket_args=pocket_args) - eval_logger.info( - "get_task_dict has been updated to accept an optional argument, `task_manager`" - "Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage" - ) task_dict = get_task_dict(tasks, task_manager) for task_name in task_dict.keys(): task_obj = task_dict[task_name] @@ -110,6 +110,8 @@ def get_ConfigurableTask( # we have to change the class properties post-hoc. This is pretty hacky. 
task_obj.override_metric(metric_name="bypass") + # override tasks' fewshot values to the provided num_fewshot arg value + # except if tasks have it set to 0 manually in their configs--then we should never overwrite that if num_fewshot is not None: if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: eval_logger.info( @@ -120,8 +122,14 @@ def get_ConfigurableTask( f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" ) task_obj.set_config(key="num_fewshot", value=num_fewshot) + else: + # if num_fewshot not provided, and the task does not define a default one, default to 0 + if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None: + task_obj.set_config(key="num_fewshot", value=0) if check_integrity: run_task_tests(task_list=tasks) - return task_dict \ No newline at end of file + return task_dict + +# adapted from evaluator.py # def evaluate(..) from lm-eval-harness to generate config task \ No newline at end of file diff --git a/apps/python/sampler/utils/lmeh.py b/apps/python/sampler/utils/lmeh.py new file mode 100644 index 0000000..07d6250 --- /dev/null +++ b/apps/python/sampler/utils/lmeh.py @@ -0,0 +1,172 @@ +################################ +# lm-eval-harness (evaulator.py) +################################ +import argparse +import json +import logging +import os +import re +import sys +from functools import partial +from pathlib import Path +from typing import Union + +import numpy as np + +from lm_eval import evaluator, utils +from lm_eval.evaluator import request_caching_arg_to_dict +from lm_eval.logging_utils import WandbLogger +from lm_eval.tasks import TaskManager +from lm_eval.utils import make_table, simple_parse_args_string + +from lm_eval.__main__ import parse_eval_args + +# Custom modules +from utils.generator import get_ConfigurableTask + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + 
parser.add_argument( + "--pocket_args", + type=str, + default='{"hellaswag": {"address": "random", "blacklist": [0, 157, 900, 1200], "qty": 5}, "gsmk8": {"address": "random", "blacklist": [0, 157, 900, 1200], "qty": 5}}', + help="json string of pocket arguments", + ) + parser.add_argument( + "--dbname", + type=str, + default="lm-evaluation-harness", + help="Name of the database", + ) + parser.add_argument( + "--user", + type=str, + default="admin", + help="Name of the user", + ) + parser.add_argument( + "--password", + type=str, + default="admin", + help="Password for the user", + ) + parser.add_argument( + "--host", + type=str, + default="localhost", + help="Host name", + ) + parser.add_argument( + "--port", + type=str, + default="5432", + help="Port number", + ) + parser.add_argument( + "--include_path", + type=str, + default=None, + metavar="DIR", + help="Additional path to include if there are external tasks to include.", + ) + parser.add_argument( + "--verbosity", + "-v", + type=str.upper, + default="INFO", + metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG", + help="Controls the reported logging error level. 
Set to DEBUG when testing + adding new task configurations for comprehensive log output.", + ) + return parser + +def cli_register_task(args: Union[argparse.Namespace, None] = None) -> None: + if not args: + # we allow for args to be passed externally, else we parse them ourselves + parser = setup_parser() + args = parse_eval_args(parser) + + eval_logger = utils.eval_logger + eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) + eval_logger.info(f"Verbosity set to {args.verbosity}") + ############################################################ + # START: POCKET NETWORK CODE + ############################################################ + POSTGRES_DB_USER = args.user + POSTGRES_DB_PASS = args.password + POSTGRES_DB_HOST = args.host + POSTGRES_DB_PORT = args.port + POSTGRES_DB_NAME = args.dbname + + postgres_uri = f"postgresql://{POSTGRES_DB_USER}:{POSTGRES_DB_PASS}@{POSTGRES_DB_HOST}:{POSTGRES_DB_PORT}/{POSTGRES_DB_NAME}" + + # Generate pocket_args from string + pocket_args = json.loads(args.pocket_args) + # join keys from pocket_args, adding "," between them + tasks = ",".join(pocket_args.keys()) + #then add the uri to the pocket_args to be used during ConfigurableTask init + for k in pocket_args.keys(): + pocket_args[k]['uri'] = postgres_uri + ############################################################ + # END: POCKET NETWORK CODE + ############################################################ + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + task_manager = TaskManager(args.verbosity, include_path=args.include_path) + + if tasks is None: + eval_logger.error("Need to specify task to evaluate.") + sys.exit() + elif tasks == "list": + eval_logger.info( + "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)) + ) + sys.exit() + else: + if os.path.isdir(tasks): + import glob + + task_names = [] + yaml_path = os.path.join(tasks, "*.yaml") + for yaml_file in glob.glob(yaml_path): + config = 
utils.load_yaml_config(yaml_file) + task_names.append(config) + else: + task_list = tasks.split(",") + task_names = task_manager.match_tasks(task_list) + for task in [task for task in task_list if task not in task_names]: + if os.path.isfile(task): + config = utils.load_yaml_config(task) + task_names.append(config) + task_missing = [ + task for task in task_list if task not in task_names and "*" not in task + ] # we don't want errors if a wildcard ("*") task name was used + + if task_missing: + missing = ", ".join(task_missing) + eval_logger.error( + f"Tasks were not found: {missing}\n" + f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks", + ) + raise ValueError( + f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues." + ) + + task_dict = get_ConfigurableTask( + tasks=task_names, + num_fewshot=None, + check_integrity=False, + gen_kwargs=None, + task_manager= None, + verbosity= "INFO", + predict_only= False, + pocket_args=pocket_args, + ) + + print(task_dict) + exit() + + +if __name__ == "__main__": + cli_register_task() \ No newline at end of file diff --git a/apps/python/sampler/utils/pocket_lm_eval/__init__.py b/apps/python/sampler/utils/pocket_lm_eval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/python/sampler/utils/pocket_lm_eval/api/task.py b/apps/python/sampler/utils/pocket_lm_eval/api/task.py new file mode 100644 index 0000000..c356694 --- /dev/null +++ b/apps/python/sampler/utils/pocket_lm_eval/api/task.py @@ -0,0 +1,290 @@ +from typing import ( + Any, + Dict, + Iterable, + Iterator, + List, + Literal, + Mapping, + Optional, + Tuple, + Union, +) +import logging +import numpy as np +import random +import datasets +from lm_eval.api import samplers +from lm_eval.api.registry import ( + AGGREGATION_REGISTRY, + DEFAULT_METRIC_REGISTRY, + get_aggregation, + get_metric, + get_metric_aggregation, + is_higher_better, +) 
+from lm_eval.api.task import ConfigurableTask, TaskConfig, ALL_OUTPUT_TYPES +from lm_eval.filters import build_filter_ensemble +from lm_eval.prompts import get_prompt + +eval_logger = logging.getLogger("lm-eval") + +import psycopg2 +from urllib.parse import urlparse + +def get_max_min_ids(uri:str, table_name:str): + """ + This function connects to a PostgreSQL database and retrieves the min and max ids for each split + + Args: + uri: The URI of the PostgreSQL database + table_name: The name of the table in the database + + Returns: + A dictionary with the min and max ids for each split + Example: + { + 'train': {'min': 0, 'max': 100}, + 'validation': {'min': 101, 'max': 200}, + 'test': {'min': 201, 'max': 300} + } + """ + + try: + # Parse the URI to extract connection parameters + uri_parts = urlparse(uri) + dbname = uri_parts.path[1:] + username = uri_parts.username + password = uri_parts.password + host = uri_parts.hostname + port = uri_parts.port + + # Connect to your PostgreSQL database + conn = psycopg2.connect( + dbname=dbname, + user=username, + password=password, + host=host, + port=port + ) + + # Create a cursor object using the cursor() method + cursor = conn.cursor() + + # Construct the SQL query + sql_query = """ + SELECT + "__split", + MIN("__id") AS min_id, + MAX("__id") AS max_id + FROM + {} + GROUP BY + "__split"; + """.format(table_name) + + # Execute the SQL query + cursor.execute(sql_query) + + # Fetch all rows from the result + rows = cursor.fetchall() + _split_ranges = {} + # Print the result + for row in rows: + _split_ranges[row[0]] = {'min':row[1], 'max': row[2]} + + except (Exception, psycopg2.Error) as error: + print("Error while connecting to PostgreSQL", error) + + finally: + # Close the cursor and database connection + if conn: + cursor.close() + conn.close() + + return _split_ranges + +def get_split_from_ids(_split_ranges : dict, __ids:List[int]): + """ + This functions take a list of ids, and detect to which range they belong to + + 
Args: + _split_ranges: A dictionary with the min and max ids for each split + Example: + { + 'train': {'min': 0, 'max': 100}, + 'validation': {'min': 101, 'max': 200}, + 'test': {'min': 201, 'max': 300} + } + + __ids: A list of ids + Example + [202, 203, 204, 205] + + Returns: + The split range to which the ids belong to + Example: + 'test' + """ + split_ranges = {} + for k,v in _split_ranges.items(): + split_ranges[k] = set(range(v['min'],v['max']+1)) + + split_range = [] + for _id in __ids: + for k,v in split_ranges.items(): + if _id in v: + split_range.append(k) + break + # all ids should belong to a split range + if len(split_range) != len(__ids): + raise ValueError("Some ids do not belong to any split range") + + # all ids should belong to a unique split range + if len(set(split_range)) != 1: + raise ValueError("Some ids belong to more than one split. Please check that ids belong to only one split (test or validation).") + + + return list(set(split_range))[0] + + + + +class PocketNetworkConfigurableTask(ConfigurableTask): + + def check_split_exist(self, split: str, _split_ranges: dict): + """ + This function checks if a self.config.split exists in the keys of _split_ranges + """ + if split not in _split_ranges.keys(): + raise ValueError( + f"'{split}' split not found in _split_ranges: {_split_ranges.keys()}" + ) + + def add_string_ids_range(self, split: str, id_list_str: str, _split_ranges: dict): + """ + This function adds a range of ids to the id_list_str + + Args: + split: The split for which the range of ids should be added (this is one of self.config._split) + id_list_str: A string of ids separated by commas + _split_ranges: A dictionary with the min and max ids for each split + + Returns: + id_list_str: A string of ids separated by commas (to be used in a SQL query) + """ + min_range = _split_ranges[split]['min'] + max_range = _split_ranges[split]['max']+1 + eval_logger.debug(f"adding split \'{split}\' to id_list_str from ranges: 
{min_range}-{max_range}") + id_list_str += ', '.join(str(id) for id in range(min_range, max_range)) + return id_list_str + + def generate_random_numbers(self, table_name:str, _split:str, qty:int, min:int, max:int, blacklist: List[int] = []) -> List[int]: + """ + This function generates a list of random numbers within a range, excluding some blacklisted numbers + """ + # check that the quantity of numbers to generate is less than the range + if qty > (max - min + 1): + raise ValueError("Quantity of numbers to generate is greater than the range") + # Generate a list of random numbers within the range [min, max] excluding the blacklist + ints = set(range(min, max+1)) + if blacklist is not None: + original_len = len(ints) + # Remove the blacklisted numbers + ints = ints - set(blacklist) + # Check that the blacklist numbers were removed + if len(ints) == original_len: + raise ValueError("Blacklist numbers corresponding to '{}' table & '{}' split were not founded in the range [min, max] generated: [{}-{}]".format(table_name, _split, min, max)) + + choices = list(np.random.choice(list(ints), qty, replace=False)) + + return choices + + def get_SQL_where_clause(self, indexes, _split: str, _split_ranges: dict): + """ + This function constructs a WHERE clause for a SQL query. 
Apply the logic detailed in + + """ + + id_list_str = '' + if self.config.test_split: + eval_logger.debug("in self.config.test_split") + self.check_split_exist(self.config.test_split, _split_ranges) + if _split != self.config.test_split: + raise ValueError( + f"_split '{_split}' not equal to test_split '{self.config.test_split}'" + ) + + id_list_str += ', '.join(str(id) for id in indexes)+', ' + + eval_logger.debug(f"Test split:\n {id_list_str}") + if self.config.validation_split: + self.check_split_exist(self.config.validation_split, _split_ranges) + id_list_str = self.add_string_ids_range(self.config.validation_split, id_list_str, _split_ranges) + + if self.config.training_split: + self.check_split_exist(self.config.training_split, _split_ranges) + id_list_str = self.add_string_ids_range(self.config.training_split, id_list_str, _split_ranges) + + if self.config.fewshot_split: + self.check_split_exist(self.config.fewshot_split, _split_ranges) + id_list_str = self.add_string_ids_range(self.config.fewshot_split, id_list_str, _split_ranges) + + elif self.config.validation_split: + eval_logger.debug(f"in self.config.validation_split") + self.check_split_exist(self.config.validation_split, _split_ranges) + if _split != self.config.validation_split: + raise ValueError( + f"_split '{_split}' not equal to validation_split '{self.config.validation_split}'" + ) + id_list_str += ', '.join(str(id) for id in indexes)+', ' + eval_logger.debug(f"Validation split:\n {id_list_str}") + if self.config.training_split: + self.check_split_exist(self.config.training_split, _split_ranges) + id_list_str = self.add_string_ids_range(self.config.training_split, id_list_str, _split_ranges) + + if self.config.fewshot_split: + self.check_split_exist(self.config.fewshot_split, _split_ranges) + id_list_str = self.add_string_ids_range(self.config.fewshot_split, id_list_str, _split_ranges) + else: + # error + raise ValueError("Neither test_split nor validation_split in config, cannot proceed, 
please check get_SQL_where_clause") + + where_clause = f"__id IN ({id_list_str})" + + return where_clause + + def download(self, dataset_kwargs: Optional[Dict[str, Any]] = None) -> None: + + blacklist = self._config.metadata['pocket_args']['blacklist'] + qty = self._config.metadata['pocket_args']['qty'] + uri = self._config.metadata['pocket_args']['uri'] + table_name = self.DATASET_PATH + "--" + self.DATASET_NAME if self.DATASET_NAME else self.DATASET_PATH + _split_ranges = get_max_min_ids(table_name=table_name, uri=uri) + eval_logger.debug(f"Split ranges:\n{ _split_ranges}") + + # Its necesarry to detect wich is the split used to test to take the range, and then get random indexes + if self.config.test_split: + _split = self.config.test_split + # validate that the split exists in the _split_ranges + self.check_split_exist(_split, _split_ranges) + elif self.config.validation_split: + _split = self.config.validation_split + # validate that the split exists in the _split_ranges + self.check_split_exist(_split, _split_ranges) + else: + raise ValueError(f"Neither {self.config.test_split} nor {self.config.validation_split} in splits were found in '_split_ranges'. 
Available splits are {_split_ranges.keys()}") + + _range = _split_ranges[_split] + indexes = self.generate_random_numbers(table_name, _split, qty, _range['min'], _range['max'], blacklist) + + where_clause = self.get_SQL_where_clause(indexes, _split, _split_ranges) + # Construct the full SQL query + sql_query = f"SELECT * FROM {table_name} WHERE {where_clause};" + + ds = datasets.Dataset.from_sql(sql_query, con = uri) + dataset = datasets.DatasetDict() + for split in ds.unique("__split"): + eval_logger.debug(f"Split: {split}") + dataset[split] = ds.filter(lambda x: x["__split"] == split) + self.dataset = dataset.remove_columns(["__id", "__split"]) diff --git a/apps/python/sampler/utils/pocket_lm_eval/tasks/__init__.py b/apps/python/sampler/utils/pocket_lm_eval/tasks/__init__.py new file mode 100644 index 0000000..766d06c --- /dev/null +++ b/apps/python/sampler/utils/pocket_lm_eval/tasks/__init__.py @@ -0,0 +1,177 @@ +import collections +import logging +import os +from functools import partial +from typing import Dict, List, Mapping, Optional, Union + +from lm_eval import utils +from lm_eval.api.task import Task +from lm_eval.tasks import TaskManager +from utils.pocket_lm_eval.api.task import PocketNetworkConfigurableTask + + +class PocketNetworkTaskManager(TaskManager): + """PocketNetworkTaskManager indexes all tasks from the default `lm_eval/tasks/` + and an optional directory if provided. 
+ + """ + + def __init__(self, verbosity="INFO", include_path: Optional[str] = None, pocket_args: Optional[Dict] = None) -> None: + self.verbosity = verbosity + self.include_path = include_path + self.pocket_args = pocket_args + self.logger = utils.eval_logger + self.logger.setLevel(getattr(logging, f"{verbosity}")) + + self._task_index = self.initialize_tasks(include_path=include_path) + self._all_tasks = sorted(list(self._task_index.keys())) + + self.task_group_map = collections.defaultdict(list) + + def _load_individual_task_or_group( + self, + name_or_config: Optional[Union[str, dict]] = None, + parent_name: Optional[str] = None, + update_config: Optional[dict] = None, + yaml_path: Optional[str] = None, + ) -> Mapping: + def load_task(config, task, group=None, yaml_path=None): + if "include" in config: + if yaml_path is None: + raise ValueError + config.update( + utils.load_yaml_config( + yaml_path, + yaml_config={"include": config.pop("include")}, + mode="full", + ) + ) + if self._config_is_python_task(config): + task_object = config["class"]() + else: + config = self._process_alias(config, group=group) + task_object = PocketNetworkConfigurableTask(config=config) + if group is not None: + task_object = (group, task_object) + return {task: task_object} + if isinstance(name_or_config, str): + if update_config is not None: + # Process name_or_config as a dict instead + name_or_config = {"task": name_or_config, **update_config} + elif self._name_is_task(name_or_config): + task_config = self._get_config(name_or_config) + ############################################################ + # START: POCKET NETWORK CODE + ############################################################ + if 'metadata' in task_config.keys(): + task_config['metadata'].update({'pocket_args':self.pocket_args[task_config['task']]}) + else: + task_config['metadata'] = {'pocket_args':self.pocket_args[task_config['task']]} + ############################################################ + # END: POCKET 
NETWORK CODE + ############################################################ + return load_task(task_config, task=name_or_config, group=parent_name) + else: + group_name = name_or_config + subtask_list = self._get_tasklist(name_or_config) + if subtask_list == -1: + group_config = self._get_config(name_or_config) + subtask_list = group_config["task"] + + # This checks if we're at the root. + if parent_name is None: + group_config = self._get_config(name_or_config) + if set(group_config.keys()) > {"task", "group"}: + update_config = { + k: v + for k, v in group_config.items() + if k not in ["task", "group"] + } + yaml_path = self._get_yaml_path(group_name) + + if (update_config is not None) and ("group_alias" in update_config): + group_name = update_config["group_alias"] + update_config.pop("group_alias") + + if isinstance(name_or_config, dict): + if update_config is not None: + name_or_config = { + **name_or_config, + **update_config, + } + + if self._config_is_task(name_or_config): + name = name_or_config["task"] + # If the name is registered as a group + # if self._name_is_task(name) is False: + if self._name_is_group(name): + group_name = name + update_config = { + k: v for k, v in name_or_config.items() if k != "task" + } + subtask_list = self._get_tasklist(name) + if subtask_list == -1: + subtask_list = self._get_config(name)["task"] + else: + if self._name_is_registered(name): + base_task_config = self._get_config(name) + + # Check if this is a duplicate. 
+ if parent_name is not None: + name_or_config["group"] = parent_name + num_duplicate = len( + list( + filter( + lambda x: x.startswith(name), + self.task_group_map[parent_name], + ) + ) + ) + if num_duplicate > 0: + name = f"{name}-{num_duplicate}" + self.task_group_map[parent_name].append(name) + + task_config = { + **base_task_config, + **name_or_config, + } + else: + task_config = name_or_config + ############################################################ + # START: POCKET NETWORK CODE + ############################################################ + if 'metadata' in task_config.keys(): + task_config['metadata'].update({'pocket_args':self.pocket_args[task_config['task']]}) + else: + task_config['metadata'] = {'pocket_args':self.pocket_args[task_config['task']]} + ############################################################ + # END: POCKET NETWORK CODE + ############################################################ + return load_task( + task_config, task=name, group=parent_name, yaml_path=yaml_path + ) + else: + group_name = name_or_config["group"] + subtask_list = name_or_config["task"] + if set(name_or_config.keys()) > {"task", "group"}: + update_config = { + k: v + for k, v in name_or_config.items() + if k not in ["task", "group"] + } + + all_subtasks = {} + if parent_name is not None: + all_subtasks = {group_name: (parent_name, None)} + + fn = partial( + self._load_individual_task_or_group, + parent_name=group_name, + update_config=update_config, + yaml_path=yaml_path, + ) + all_subtasks = { + **all_subtasks, + **dict(collections.ChainMap(*map(fn, subtask_list))), + } + return all_subtasks \ No newline at end of file diff --git a/infrastructure/dev/.env b/infrastructure/dev/.env index 011cf16..7b4c79c 100644 --- a/infrastructure/dev/.env +++ b/infrastructure/dev/.env @@ -4,6 +4,7 @@ TEMPORAL_UI_VERSION=2.22.3 POSTGRESQL_VERSION=14.11-bookworm POSTGRES_PASSWORD=admin POSTGRES_USER=admin +POSTGRES_DB=lm-evaluation-harness PGADMIN_VERSION=8.4 
PGADMIN_DEFAULT_EMAIL=admin@local.dev
PGADMIN_DEFAULT_PASSWORD=admin
diff --git a/infrastructure/dev/docker-compose.yaml b/infrastructure/dev/docker-compose.yaml
index 22c8046..30f8acc 100644
--- a/infrastructure/dev/docker-compose.yaml
+++ b/infrastructure/dev/docker-compose.yaml
@@ -10,6 +10,7 @@ services:
     environment:
       POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
       POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_DB: ${POSTGRES_DB}
     networks:
       - temporal
       - postgresql
diff --git a/infrastructure/dev/postgresql/.gitignore b/infrastructure/dev/postgresql/.gitignore
new file mode 100644
index 0000000..6320cd2
--- /dev/null
+++ b/infrastructure/dev/postgresql/.gitignore
@@ -0,0 +1 @@
+data
\ No newline at end of file
diff --git a/infrastructure/dev/postgresql/servers.json b/infrastructure/dev/postgresql/servers.json
index c274061..377cc94 100644
--- a/infrastructure/dev/postgresql/servers.json
+++ b/infrastructure/dev/postgresql/servers.json
@@ -5,7 +5,7 @@
     "Group": "Servers",
     "Host": "postgresql",
     "Port": 5432,
-    "MaintenanceDB": "postgres",
+    "MaintenanceDB": "lm-evaluation-harness",
     "Username": "admin",
     "PassFile": "/pgadmin4/pgpass",
     "SSLMode": "prefer"