diff --git a/config/helm_tests.json b/config/helm_tests.json
index aa9a745..52dc3fb 100644
--- a/config/helm_tests.json
+++ b/config/helm_tests.json
@@ -1,5 +1,440 @@
 {
-    "LegalSupport": [
+    "babi_qa_all": [
+        {
+            "name": "babi_qa:task=all",
+            "metric": "quasi_exact_match",
+            "suffix" : "---",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_3": [
+        {
+            "name": "babi_qa:task=3",
+            "metric": "quasi_exact_match",
+            "suffix" : "---",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_15": [
+        {
+            "name": "babi_qa:task=15",
+            "metric": "quasi_exact_match",
+            "suffix" : "---",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_19": [
+        {
+            "name": "babi_qa:task=19",
+            "metric": "quasi_exact_match",
+            "suffix" : "---",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_1": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=1",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_2": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=2",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_3": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=3",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_4": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=4",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_5": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=5",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_6": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=6",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_7": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=7",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_8": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=8",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_9": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=9",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_10": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=10",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_11": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=11",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_12": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=12",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_13": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=13",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_14": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=14",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_15": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=15",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_16": [
+        {
+            "suffix": "---",
+            "name": "babi_qa_subtask:compilation=16",
+            "metric": "quasi_exact_match",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "babi_qa_Task_17": [
+        {
+            "suffix": "---",
"babi_qa_subtask:compilation=17", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test" + } + ], + "babi_qa_Task_18": [ + { + "suffix": "---", + "name": "babi_qa_subtask:compilation=18", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test" + } + ], + "babi_qa_Task_19": [ + { + "suffix": "---", + "name": "babi_qa_subtask:compilation=19", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test" + } + ], + "babi_qa_Task_20": [ + { + "suffix": "---", + "name": "babi_qa_subtask:compilation=20", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test" + } + ], + "blimp": [ + { + "name": "blimp:phenomenon=binding,method=multiple_choice_separate_original", + "suffix": "---", + "metric": "exact_match", + "field": "mean", + "split": "test" + }, + { + "name": "blimp:phenomenon=irregular_forms,method=multiple_choice_separate_original", + "suffix": "---", + "metric": "exact_match", + "field": "mean", + "split": "test" + }, + { + "name": "blimp:phenomenon=island_effects,method=multiple_choice_separate_original", + "suffix": "---", + "metric": "exact_match", + "field": "mean", + "split": "test" + }, + { + "name": "blimp:phenomenon=quantifiers,method=multiple_choice_separate_original", + "suffix": "---", + "metric": "exact_match", + "field": "mean", + "split": "test" + } + ], + "blimp_binding" : [ + { + "name" : "blimp:phenomenon=binding,method=multiple_choice_separate_original", + "metric": "exact_match", + "field" : "mean", + "split": "test", + "suffix": "---" + + } + ], + "blimp_irregular_forms" : [ + { + "name" : "blimp:phenomenon=irregular_forms,method=multiple_choice_separate_original", + "metric": "exact_match", + "field" : "mean", + "split": "test", + "suffix": "---" + } + ], + "blimp_island_effects" : [ + { + "name" : "blimp:phenomenon=island_effects,method=multiple_choice_separate_original", + "metric": "exact_match", + "field" : "mean", + "split": "test", + "suffix": "---" + } + ], + "blimp_quantifiers" : [ + { + "name" : "blimp:phenomenon=quantifiers,method=multiple_choice_separate_original", + "metric": "exact_match", + "field" : "mean", + "split": "test", + "suffix": "---" + } + ], + "boolq": [ + { + "name": "boolq", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "test" + } + ], + "dyck_language": [ + { + "name": "dyck_language_np=3", + "suffix": "---", + "metric": "exact_match", + "field": "mean", + "split": "test" + } + ], + "entity_matching_abt_buy": [ + { + "name": "entity_matching:dataset=Abt_Buy", + "suffix": "---", + "metric": "exact_match", + "field": "mean", + "split": "test" + } + ], + "entity_matching_beer": [ + { + "name": "entity_matching:dataset=Beer", + "metric": "exact_match", + "suffix": "---", + "field": "mean", + "split": "test" + } + ], + "entity_matching_itunes": [ + { + "name": "entity_matching:dataset=Dirty_iTunes_Amazon", + "metric": "exact_match", + "suffix": "---", + "field": "mean", + "split": "test" + } + ], + "hellaswag": [ + { + "name": "commonsense:dataset=hellaswag", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "valid" + } + ], + "ice": [ + { + "suffix" : "---", + "name": "ice:gender=female", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + }, + { + "suffix" : "---", + "name": "ice:gender=male", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + }, + { + "suffix" : "---", + "name": "ice:subset=ea", + "metric": "bits_per_byte", + "field": "mean", + "split": 
"test" + }, + { + "suffix" : "---", + "name": "ice:subset=hk", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + }, + { + "suffix" : "---", + "name": "ice:subset=ind", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + }, + { + "suffix" : "---", + "name": "ice:subset=usa", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + } + ], + "ice_female": [ + { + "suffix" : "---", + "name": "ice:gender=female", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + } + ], + "ice_male": [ + { + "suffix" : "---", + "name": "ice:gender=male", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + } + ], + "ice_ea": [ + { + "suffix" : "---", + "name": "ice:subset=ea", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + } + ], + "ice_hk": [ + { + "suffix" : "---", + "name": "ice:subset=hk", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + } + ], + "ice_ind": [ + { + "suffix" : "---", + "name": "ice:subset=ind", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + } + ], + "ice_usa": [ + { + "suffix" : "---", + "name": "ice:subset=usa", + "metric": "bits_per_byte", + "field": "mean", + "split": "test" + } + ], + "legal_support": [ { "name": "legal_support,method=multiple_choice_joint", "suffix" : "---", @@ -8,77 +443,129 @@ "split": "test" } ], - "Synthetic_reasoning_(natural_language)": [ + "lsat": [ + { + "name": "lsat_qa:task=all", + "metric": "quasi_exact_match", + "suffix": "---", + "field": "mean", + "split": "test" + } + ], + "mmlu": [ + { + "name": "mmlu:subject=abstract_algebra", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "valid" + }, + { + "name": "mmlu:subject=college_chemistry", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "valid" + }, + { + "name": "mmlu:subject=computer_security", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "valid" + }, + { + "name": "mmlu:subject=econometrics", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "valid" + }, + { + "name": "mmlu:subject=us_foreign_policy", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "valid" + } + ], + "mmlu_abstract_algebra": [ { - "name": "synthetic_reasoning_natural:difficulty=hard", - "suffix" : "---", - "metric": "f1_set_match", + "name": "mmlu:subject=abstract_algebra", + "metric": "exact_match", "field": "mean", - "split": "test" - }, + "split": "valid" + } + ], + "mmlu_college_chemistry": [ { - "name": "synthetic_reasoning_natural:difficulty=easy", - "metric": "f1_set_match", - "suffix" : "---", + "name": "mmlu:subject=college_chemistry", + "metric": "exact_match", "field": "mean", - "split": "test" + "split": "valid" } ], - "Synthetic_reasoning_(abstract_symbols)---pattern_match": [ + "mmlu_computer_security": [ { - "name": "synthetic_reasoning:mode=pattern_match", - "suffix" : "---", - "metric": "quasi_exact_match", + "name": "mmlu:subject=computer_security", + "metric": "exact_match", "field": "mean", - "split": "test" + "split": "valid" } ], - "Synthetic_reasoning_(abstract_symbols)---variable_sustitution": [ + "mmlu_econometrics": [ { - "name": "synthetic_reasoning:mode=variable_substitution", - "suffix" : "---", - "metric": "quasi_exact_match", + "name": "mmlu:subject=econometrics", + "metric": "exact_match", "field": "mean", - "split": 
"test" + "split": "valid" } ], - "Synthetic_reasoning_(abstract_symbols)---induction": [ + "mmlu_us_foreign_policy": [ { - "name": "synthetic_reasoning:mode=induction", - "suffix" : "---", - "metric": "quasi_exact_match", + "name": "mmlu:subject=us_foreign_policy", + "metric": "exact_match", "field": "mean", - "split": "test" + "split": "valid" } ], - "bAbI": [ + "msmarco_regular": [ { - "name": "babi_qa:task=all", - "metric": "quasi_exact_match", - "suffix" : "---", + "name": "msmarco:track=regular,valid_topk=30", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", "field": "mean", "split": "test" } ], - "LSAT": [ + "msmarco_trec": [ { - "name": "lsat_qa:task=all", - "metric": "quasi_exact_match", - "suffix": "---", + "name": "msmarco:track=trec,valid_topk=30", + "suffix": "data_augmentation=canonical", + "metric": "exact_match", "field": "mean", "split": "test" } ], - "HellaSwag": [ + "natural_qa_closed": [ { - "name": "commonsense:dataset=hellaswag", - "suffix": "data_augmentation=canonical", + "name": "natural_qa:mode=closedbook", "metric": "exact_match", "field": "mean", + "suffix": "data_augmentation=canonical", "split": "valid" } ], - "OpenBookQA": [ + "natural_qa_open": [ + { + "name": "natural_qa:mode=openbook_longans", + "metric": "quasi_exact_match", + "field": "mean", + "suffix": "data_augmentation=canonical", + "split": "test" + } + ], + "openbookqa": [ { "name": "commonsense:dataset=openbookqa", "suffix": "data_augmentation=canonical", @@ -87,49 +574,100 @@ "split": "test" } ], - "MMLU": [ + "quac": [ { - "name": "mmlu:subject=abstract_algebra", - "suffix": "data_augmentation=canonical", + "name": "quac", "metric": "exact_match", "field": "mean", - "split": "valid" - }, - { - "name": "mmlu:subject=college_chemistry", "suffix": "data_augmentation=canonical", - "metric": "exact_match", + "split": "test" + } + ], + "synthetic_reasoning_nl": [ + { + "name": "synthetic_reasoning_natural:difficulty=hard", + "suffix" : "---", + "metric": "f1_set_match", "field": "mean", - "split": "valid" + "split": "test" }, { - "name": "mmlu:subject=computer_security", - "suffix": "data_augmentation=canonical", - "metric": "exact_match", + "name": "synthetic_reasoning_natural:difficulty=easy", + "metric": "f1_set_match", + "suffix" : "---", "field": "mean", - "split": "valid" + "split": "test" + } + ], + "synthetic_reasoning_nl_easy": [ + { + "name": "synthetic_reasoning_natural:difficulty=hard", + "suffix" : "---", + "metric": "f1_set_match", + "field": "mean", + "split": "test" + } + ], + "synthetic_reasoning_nl_hard": [ + { + "name": "synthetic_reasoning_natural:difficulty=easy", + "metric": "f1_set_match", + "suffix" : "---", + "field": "mean", + "split": "test" + } + ], + "synthetic_reasoning_abstract": [ + { + "name": "synthetic_reasoning:mode=pattern_match", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test", + "suffix" : "---" }, { - "name": "mmlu:subject=econometrics", - "suffix": "data_augmentation=canonical", - "metric": "exact_match", + "name": "synthetic_reasoning:mode=variable_substitution", + "metric": "quasi_exact_match", "field": "mean", - "split": "valid" + "split": "test", + "suffix" : "---" }, { - "name": "mmlu:subject=us_foreign_policy", - "suffix": "data_augmentation=canonical", - "metric": "exact_match", + "name": "synthetic_reasoning:mode=induction", + "metric": "quasi_exact_match", "field": "mean", - "split": "valid" + "split": "test", + "suffix" : "---" } ], - "WikiText-103": [ + "synthetic_reasoning_pattern_match": [ { - "name": 
"IGNORE-ME" + "name": "synthetic_reasoning:mode=pattern_match", + "suffix" : "---", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test" + } + ], + "synthetic_reasoning_variable_substitution": [ + { + "name": "synthetic_reasoning:mode=variable_substitution", + "suffix" : "---", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test" + } + ], + "synthetic_reasoning_induction": [ + { + "name": "synthetic_reasoning:mode=induction", + "suffix" : "---", + "metric": "quasi_exact_match", + "field": "mean", + "split": "test" } ], - "The Pile": [ + "the_pile": [ { "name": "the_pile:subset=ArXiv", "metric": "bits_per_byte", @@ -167,100 +705,191 @@ "split": "test" } ], - "TwitterAAE": [ + "the_pile_arxiv" : [ { - "name": "twitter_aae:demographic=white", + "name" : "the_pile:subset=ArXiv", "metric": "bits_per_byte", - "field": "mean", + "field" : "mean", "split": "test" - }, + } + ], + "the_pile_bookcorpus2" : [ { - "name": "twitter_aae:demographic=aa", + "name" : "the_pile:subset=BookCorpus2", "metric": "bits_per_byte", - "field": "mean", + "field" : "mean", "split": "test" } ], - "ICE": [ + "the_pile_enron" : [ { - "name": "ice:gender=female", + "name" : "the_pile:subset=Enron Emails", "metric": "bits_per_byte", - "field": "mean", + "field" : "mean", "split": "test" - }, + } + ], + "the_pile_github" : [ { - "name": "ice:gender=male", + "name" : "the_pile:subset=Github", "metric": "bits_per_byte", - "field": "mean", + "field" : "mean", "split": "test" - }, + } + ], + "the_pile_pubmed" : [ { - "name": "ice:subset=ea", + "name" : "the_pile:subset=PubMed Central", + "metric": "bits_per_byte", + "field" : "mean", + "split": "test" + } + ], + "the_pile_wikipedia" : [ + { + "name" : "the_pile:subset=Wikipedia (en)", + "metric": "bits_per_byte", + "field" : "mean", + "split": "test" + } + ], + "truthful_qa": [ + { + "name": "truthful_qa:task=mc_single,method=multiple_choice_joint", + "suffix" : "data_augmentation=canonical", + "metric": "exact_match", + "field": "mean", + "split": "valid" + } + ], + "twitter_aae": [ + { + "name": "twitter_aae:demographic=white", "metric": "bits_per_byte", "field": "mean", "split": "test" }, { - "name": "ice:subset=hk", + "name": "twitter_aae:demographic=aa", "metric": "bits_per_byte", "field": "mean", "split": "test" - }, + } + ], + "twitter_aae_white": [ { - "name": "ice:subset=ind", + "name": "twitter_aae:demographic=white", "metric": "bits_per_byte", "field": "mean", "split": "test" - }, + } + ], + "twitter_aae_aa": [ { - "name": "ice:subset=usa", + "name": "twitter_aae:demographic=aa", "metric": "bits_per_byte", "field": "mean", "split": "test" } ], - "WikiData": [ + "wikidata": [ { "name": "IGNORE-ME" } ], - "BLiMP": [ + "wikifact_author": [ { - "name": "blimp:phenomenon=binding,method=multiple_choice_separate_original", + "name": "wikifact:k=5,subject=author", + "metric": "exact_match", + "field": "mean", "suffix": "---", + "split": "test" + } + ], + "wikifact_currency": [ + { + "name": "wikifact:k=5,subject=currency", "metric": "exact_match", "field": "mean", + "suffix": "---", "split": "test" - }, + } + ], + "wikifact_discoverer_or_inventor": [ { - "name": "blimp:phenomenon=irregular_forms,method=multiple_choice_separate_original", + "name": "wikifact:k=5,subject=discoverer_or_inventor", + "metric": "exact_match", "suffix": "---", + "field": "mean", + "split": "test" + } + ], + "wikifact_instance_of": [ + { + "name": "wikifact:k=5,subject=instance_of", "metric": "exact_match", "field": "mean", + "suffix": "---", "split": "test" - }, + 
-        },
+        }
+    ],
+    "wikifact_medical_condition_treated": [
        {
-            "name": "blimp:phenomenon=island_effects,method=multiple_choice_separate_original",
+            "name": "wikifact:k=5,subject=medical_condition_treated",
+            "metric": "exact_match",
+            "field": "mean",
+            "suffix": "---",
+            "split": "test"
+        }
+    ],
+    "wikifact_part_of": [
+        {
+            "name": "wikifact:k=5,subject=part_of",
+            "metric": "exact_match",
+            "field": "mean",
             "suffix": "---",
+            "split": "test"
+        }
+    ],
+    "wikifact_place_of_birth": [
+        {
+            "name": "wikifact:k=5,subject=place_of_birth",
             "metric": "exact_match",
             "field": "mean",
+            "suffix": "---",
             "split": "test"
-        },
+        }
+    ],
+    "wikifact_plaintiff": [
        {
-            "name": "blimp:phenomenon=quantifiers,method=multiple_choice_separate_original",
+            "name": "wikifact:k=5,subject=plaintiff",
+            "metric": "exact_match",
             "suffix": "---",
+            "field": "mean",
+            "split": "test"
+        }
+    ],
+    "wikifact_position_held": [
+        {
+            "name": "wikifact:k=5,subject=position_held",
             "metric": "exact_match",
             "field": "mean",
+            "suffix": "---",
             "split": "test"
         }
     ],
-    "truthful_qa": [
+    "wikifact_symptoms_and_signs": [
        {
-            "name": "truthful_qa:task=mc_single,method=multiple_choice_joint",
-            "suffix" : "data_augmentation=canonical",
+            "name": "wikifact:k=5,subject=symptoms_and_signs",
             "metric": "exact_match",
+            "suffix": "---",
             "field": "mean",
-            "split": "valid"
+            "split": "test"
         }
+    ],
+    "wikitext-103": [
+        {
+            "name": "IGNORE-ME"
+        }
     ]
-}
\ No newline at end of file
+}
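
For reference, a minimal sketch of how a harness might consume this lookup table. Only the schema shown above (name, metric, field, split, suffix) comes from the file itself; the loader, the helper names, and the example key used here are hypothetical, not part of this change:

    import json

    def load_helm_tests(path="config/helm_tests.json"):
        # Parse the lookup table added in this diff.
        with open(path) as f:
            return json.load(f)

    def expected_stats(tests, key):
        # Return (run name, metric, aggregation field, split) for one test key.
        return [
            (t["name"], t["metric"], t["field"], t["split"])
            for t in tests[key]
        ]

    if __name__ == "__main__":
        tests = load_helm_tests()
        # e.g. [("mmlu:subject=abstract_algebra", "exact_match", "mean", "valid")]
        print(expected_stats(tests, "mmlu_abstract_algebra"))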