diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000..62f866af40 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,18 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - uses: pre-commit/action@v3.0.1 \ No newline at end of file diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml index 11a6151d7c..593af0335e 100644 --- a/.github/workflows/run_tests.yaml +++ b/.github/workflows/run_tests.yaml @@ -10,6 +10,7 @@ on: jobs: check_files: + name: Unit tests runs-on: ubuntu-latest steps: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f651895b60..ab1c9b71df 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,8 @@ repos: rev: 'v1.3.0' hooks: - id: mypy + additional_dependencies: + - typed-ast args: ["--config-file=mypy.ini", "--no-site-packages"] - repo: https://github.com/psf/black diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py index e48a09ac19..3c69c9811b 100644 --- a/evals/cli/oaieval.py +++ b/evals/cli/oaieval.py @@ -5,7 +5,7 @@ import logging import shlex import sys -from typing import Any, Mapping, Optional, Union, cast +from typing import Any, Optional, Union, cast import evals import evals.api @@ -135,7 +135,7 @@ def run(args: OaiEvalArguments, registry: Optional[Registry] = None) -> str: def parse_extra_eval_params( param_str: Optional[str], - ) -> Mapping[str, Union[str, int, float]]: + ) -> dict[str, Union[str, int, float]]: """Parse a string of the form "key1=value1,key2=value2" into a dict.""" if not param_str: return {} @@ -227,7 +227,9 @@ def to_number(x: str) -> Union[int, float, str]: try: add_token_usage_to_result(result, recorder) except Exception as e: - logger.error(f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected.") + logger.error( + f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected." + ) recorder.record_final_report(result) if not (args.dry_run or args.local_run): diff --git a/evals/elsuite/already_said_that/scripts/gen_data.py b/evals/elsuite/already_said_that/scripts/gen_data.py index 94f827d0d6..ba2c32b8a0 100644 --- a/evals/elsuite/already_said_that/scripts/gen_data.py +++ b/evals/elsuite/already_said_that/scripts/gen_data.py @@ -1,7 +1,7 @@ import argparse +import json import os import random -import json import nltk from nltk.corpus import wordnet @@ -60,13 +60,9 @@ def main(args: argparse.Namespace): parser = argparse.ArgumentParser() parser.add_argument("--n_samples", type=int, default=500) - parser.add_argument( - "--n_words", type=int, default=100, help="Number of words in each sample" - ) + parser.add_argument("--n_words", type=int, default=100, help="Number of words in each sample") parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--jsonl_dir", type=str, default="./evals/registry/data/already_said_that/" - ) + parser.add_argument("--jsonl_dir", type=str, default="./evals/registry/data/already_said_that/") args = parser.parse_args() diff --git a/evals/elsuite/already_said_that/scripts/make_plots.py b/evals/elsuite/already_said_that/scripts/make_plots.py index ede36291ec..0bcf1575a4 100644 --- a/evals/elsuite/already_said_that/scripts/make_plots.py +++ b/evals/elsuite/already_said_that/scripts/make_plots.py @@ -1,11 +1,11 @@ -from pathlib import Path import argparse import json +from pathlib import Path -from tqdm.auto import tqdm -import numpy as np import matplotlib.pyplot as plt +import numpy as np import seaborn as sns +from tqdm.auto import tqdm from evals.utils import log_utils @@ -98,9 +98,7 @@ def make_results_dict(log_dir: Path) -> dict: def prepare_results_dict() -> dict: results_dict = { stat: { - distractor: { - model: {"raw": [], "mean": 0, "std_err": 0} for model in MODELS - } + distractor: {model: {"raw": [], "mean": 0, "std_err": 0} for model in MODELS} for distractor in DISTRACTORS } for stat in [ @@ -136,9 +134,9 @@ def fill_results_dict(results_dict: dict, log_dir: Path) -> dict: for stat in results_dict: data_points = results_dict[stat][distractor][model]["raw"] results_dict[stat][distractor][model]["mean"] = np.mean(data_points) - results_dict[stat][distractor][model]["std_err"] = np.std( - data_points - ) / np.sqrt(NUM_REPEATS) + results_dict[stat][distractor][model]["std_err"] = np.std(data_points) / np.sqrt( + NUM_REPEATS + ) return results_dict @@ -205,9 +203,7 @@ def make_bar_plot(results_dict: dict, stat: str, save_path: Path): legend_indices = list(range(len(distractors)))[::-1] means = [[data[dis][model]["mean"] for dis in distractors] for model in models] - std_errs = [ - [data[dis][model]["std_err"] for dis in distractors] for model in models - ] + std_errs = [[data[dis][model]["std_err"] for dis in distractors] for model in models] cmap = plt.get_cmap("Set3") colors = np.array([cmap(i) for i in range(len(distractors))]) @@ -282,15 +278,9 @@ def count_tokens(log_dir) -> dict[str, dict[str, dict[str, int]]]: samplings = log_utils.extract_individual_results(log, "sampling") for sampling in samplings: usage = sampling["usage"] - token_counts[model][distractor]["input"] += zero_if_none( - usage["prompt_tokens"] - ) - token_counts[model][distractor]["output"] += zero_if_none( - usage["completion_tokens"] - ) - token_counts[model][distractor]["total"] += zero_if_none( - usage["total_tokens"] - ) + token_counts[model][distractor]["input"] += zero_if_none(usage["prompt_tokens"]) + token_counts[model][distractor]["output"] += zero_if_none(usage["completion_tokens"]) + token_counts[model][distractor]["total"] += zero_if_none(usage["total_tokens"]) return token_counts @@ -318,11 +308,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--log_dir", type=str, required=True, help="Where the logs are stored" - ) - parser.add_argument( - "--save_dir", type=str, required=True, help="Where to save the plots" - ) + parser.add_argument("--log_dir", type=str, required=True, help="Where the logs are stored") + parser.add_argument("--save_dir", type=str, required=True, help="Where to save the plots") args = parser.parse_args() main(args) diff --git a/evals/elsuite/cant_do_that_anymore/eval.py b/evals/elsuite/cant_do_that_anymore/eval.py index 0ca6df5b0b..017519fd4c 100644 --- a/evals/elsuite/cant_do_that_anymore/eval.py +++ b/evals/elsuite/cant_do_that_anymore/eval.py @@ -137,7 +137,9 @@ def run(self, recorder: RecorderBase) -> dict[str, Union[float, int]]: std_num_previous_moves = np.std([i["num_previous_moves"] for i in metrics]) delta = predicted_move_in_variant_proportion - predicted_move_proportion - variant_impact_factor = (delta / predicted_move_proportion) if predicted_move_proportion != 0 else 0 + variant_impact_factor = ( + (delta / predicted_move_proportion) if predicted_move_proportion != 0 else 0 + ) results = { "variant_impact_factor": variant_impact_factor, diff --git a/evals/elsuite/function_deduction/scripts/dataset/create_dataset.py b/evals/elsuite/function_deduction/scripts/dataset/create_dataset.py index 931e1cc27a..b902f6b7f4 100644 --- a/evals/elsuite/function_deduction/scripts/dataset/create_dataset.py +++ b/evals/elsuite/function_deduction/scripts/dataset/create_dataset.py @@ -11,10 +11,12 @@ def get_func_from_code(code): def get_complexity(code: str) -> int: - # NOTE: this is quite ugly, but should be good enough for dataset-creating code - code = "global func_name\ndef func_name(x): return " + code - exec(code) - return len(list(dis.get_instructions(func_name))) + # NOTE: ugly but good enough for dataset-creating code + src = f"def _tmp(x): return {code}" + ns: dict[str, object] = {} + exec(src, {"math": math}, ns) # define function into ns + fn = ns["_tmp"] # retrieve it + return len(list(dis.get_instructions(fn))) def create_dataset(out_file, in_file): diff --git a/evals/elsuite/function_deduction/scripts/make_plots.py b/evals/elsuite/function_deduction/scripts/make_plots.py index 4c8f5f5e78..dbd5caa488 100644 --- a/evals/elsuite/function_deduction/scripts/make_plots.py +++ b/evals/elsuite/function_deduction/scripts/make_plots.py @@ -11,27 +11,19 @@ "Average Baseline": "blue", "Full Knowledge Best": "blue", "Full Knowledge Random": "blue", - "Human": "steelblue", - "gpt-4-32k": "purple", "gpt-4-32k w CoT": "purple", - "gpt-4-base w Few-shot": "orange", "gpt-4-base w CoT and Few-shot": "orange", - "gpt-3.5-turbo-16k": "green", "gpt-3.5-turbo-16k w CoT": "green", - "gemini-pro": "peru", "gemini-pro w CoT": "peru", - "llama-2-13b-chat": "brown", "llama-2-13b-chat w CoT": "brown", - "llama-2-70b-chat": "maroon", "llama-2-70b-chat w CoT": "maroon", - "mixtral-8x7b-instruct": "grey", "mixtral-8x7b-instruct w CoT": "grey", } @@ -40,27 +32,19 @@ "function_deduction/full_knowledge_best": "Full Knowledge Best", "function_deduction/full_knowledge_random": "Full Knowledge Random", "function_deduction/average_baseline": "Average Baseline", - "human_cli": "Human", - "gpt-4-32k": "gpt-4-32k", "function_deduction/cot/gpt-4-32k": "gpt-4-32k w CoT", - "function_deduction/gpt-4-base": "gpt-4-base w Few-shot", "function_deduction/cot/gpt-4-base": "gpt-4-base w CoT and Few-shot", - "gpt-3.5-turbo-16k": "gpt-3.5-turbo-16k", "function_deduction/cot/gpt-3.5-turbo-16k": "gpt-3.5-turbo-16k w CoT", - "generation/direct/gemini-pro": "gemini-pro", "function_deduction/cot/gemini-pro": "gemini-pro w CoT", - "generation/direct/llama-2-13b-chat": "llama-2-13b-chat", "function_deduction/cot/llama-2-13b-chat": "llama-2-13b-chat w CoT", - "generation/direct/llama-2-70b-chat": "llama-2-70b-chat", "function_deduction/cot/llama-2-70b-chat": "llama-2-70b-chat w CoT", - "generation/direct/mixtral-8x7b-instruct": "mixtral-8x7b-instruct", "function_deduction/cot/mixtral-8x7b-instruct": "mixtral-8x7b-instruct w CoT", } diff --git a/evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py b/evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py index e7578d8194..c923b7d321 100644 --- a/evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py +++ b/evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py @@ -9,7 +9,7 @@ class Agent: def __init__(self, env): # Set the action space seed so sampling from it is deterministic env.action_space.seed(episode) - + self.env = env def act(self, observation): diff --git a/evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py b/evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py index c4c05b0233..b35a04b242 100644 --- a/evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py +++ b/evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py @@ -1,17 +1,12 @@ import logging -import shutil from functools import cache from pathlib import Path -from tempfile import TemporaryDirectory import pandas as pd from torchvision import datasets -import evals.elsuite.hr_ml_agent_bench.benchmarks.cifar10.env.train as baseline_script -from evals.elsuite.hr_ml_agent_bench.low_level_actions import execute_script from evals.elsuite.hr_ml_agent_bench.utils import get_baseline_score - logger = logging.getLogger(__name__) @@ -54,7 +49,7 @@ def get_naive_baseline_score() -> float: Executes the baseline script `train.py` and returns the accuracy. Expects the predictions to be saved to `submission.csv` when run. """ - + scripts_dir = Path(__file__).parent env_dir = scripts_dir.parent / "env" naive_baseline = env_dir / "train.py" diff --git a/evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py b/evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py index b7ca4c9de0..19d667c3f5 100644 --- a/evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py +++ b/evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py @@ -9,7 +9,7 @@ class Agent: def __init__(self, env): # Set the action space seed so sampling from it is deterministic env.action_space.seed(episode) - + self.env = env def act(self, observation): diff --git a/evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py b/evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py index 4f9f158d9c..1811027b81 100644 --- a/evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py +++ b/evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py @@ -9,7 +9,7 @@ class Agent: def __init__(self, env): # Set the action space seed so sampling from it is deterministic env.action_space.seed(episode) - + self.env = env def act(self, observation): diff --git a/evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py b/evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py index 979a5ffa81..30f93345a6 100644 --- a/evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py +++ b/evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py @@ -68,7 +68,9 @@ def get_naive_baseline_score() -> float: """ env_dir = Path(__file__).parent / ".." / "env" - dataset_dir = get_root_dir() / "registry" / "data" / "hr_ml_agent_bench" / "ogbn_arxiv" / "dataset" + dataset_dir = ( + get_root_dir() / "registry" / "data" / "hr_ml_agent_bench" / "ogbn_arxiv" / "dataset" + ) with TemporaryDirectory() as tmp_dir: dst_dir = Path(tmp_dir) / "env" diff --git a/evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py b/evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py index cbad322131..ff2c1e9a97 100644 --- a/evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py +++ b/evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py @@ -9,7 +9,7 @@ class Agent: def __init__(self, env): # Set the action space seed so sampling from it is deterministic env.action_space.seed(episode) - + self.env = env def act(self, observation): diff --git a/evals/elsuite/hr_ml_agent_bench/high_level_actions.py b/evals/elsuite/hr_ml_agent_bench/high_level_actions.py index 8383376367..5881829ea0 100644 --- a/evals/elsuite/hr_ml_agent_bench/high_level_actions.py +++ b/evals/elsuite/hr_ml_agent_bench/high_level_actions.py @@ -76,7 +76,7 @@ def edit_script( # TODO: handle long file editing try: content = read_file(script_name, work_dir=work_dir, **kwargs) - except: + except EnvException: write_file(script_name, "", work_dir=work_dir, **kwargs) content = "" @@ -135,12 +135,12 @@ def edit_script_lines( try: start_line_number = int(start_line_number) end_line_number = int(end_line_number) - except: + except (TypeError, ValueError): raise EnvException("start_line_number and end_line_number must be integers") try: orig_content = read_file(script_name, work_dir=work_dir, **kwargs) - except: + except EnvException: write_file(script_name, "", work_dir=work_dir, **kwargs) orig_content = "" lines = orig_content.split("\n") @@ -197,14 +197,12 @@ def inspect_script_lines(script_name, start_line_number, end_line_number, work_d try: start_line_number = int(start_line_number) end_line_number = int(end_line_number) - except: + except (TypeError, ValueError): raise EnvException("start_line_number and end_line_number must be integers") - if end_line_number - start_line_number > 100: - raise EnvException("the number of lines to display is limited to 100 lines") + try: - # lines = open(os.path.join(work_dir,script_name)).readlines() lines = read_file(script_name, work_dir=work_dir, **kwargs).split("\n") - except: + except EnvException: raise EnvException(f"cannot find script {script_name}") content = "\n".join(lines[max(int(start_line_number) - 1, 0) : int(end_line_number)]) diff --git a/evals/elsuite/hr_ml_agent_bench/low_level_actions.py b/evals/elsuite/hr_ml_agent_bench/low_level_actions.py index 10ab2c93c1..37062aa600 100644 --- a/evals/elsuite/hr_ml_agent_bench/low_level_actions.py +++ b/evals/elsuite/hr_ml_agent_bench/low_level_actions.py @@ -119,8 +119,13 @@ def list_files(dir_path, work_dir=".", **kwargs): ["ls", "-F", os.path.join(work_dir, dir_path)] ).decode("utf-8") return observation - except: - raise EnvException(f"Cannot list file in the {dir_path} directory") + except ( + subprocess.CalledProcessError, + FileNotFoundError, + NotADirectoryError, + PermissionError, + ) as e: + raise EnvException(f"Cannot list file in the {dir_path} directory: {e}") @check_file_in_work_dir(["file_name"]) @@ -129,8 +134,8 @@ def read_file(file_name, work_dir=".", **kwargs): try: observation = open(os.path.join(work_dir, file_name)).read() return observation - except: - raise EnvException(f"cannot read file {file_name}") + except (FileNotFoundError, IsADirectoryError, PermissionError, OSError) as e: + raise EnvException(f"cannot read file {file_name}: {e}") @check_file_in_work_dir(["file_name"]) @@ -142,8 +147,8 @@ def write_file(file_name, content, work_dir=".", **kwargs): f.write(content) observation = f"File {file_name} written successfully." return observation - except: - raise EnvException(f"cannot write file {file_name}") + except (IsADirectoryError, PermissionError, OSError) as e: + raise EnvException(f"cannot write file {file_name}: {e}") @check_file_in_work_dir(["file_name"]) @@ -155,8 +160,8 @@ def append_file(file_name, content, work_dir=".", **kwargs): f.write(content) observation = f"File {file_name} appended successfully." return observation - except: - raise EnvException(f"cannot append file {file_name}") + except (IsADirectoryError, PermissionError, OSError) as e: + raise EnvException(f"cannot append file {file_name}: {e}") @check_file_in_work_dir(["source", "destination"]) @@ -167,9 +172,15 @@ def copy_file(source, destination, work_dir=".", **kwargs): shutil.copyfile(os.path.join(work_dir, source), os.path.join(work_dir, destination)) observation = f"File {source} copied to {destination}" return observation - except: + except ( + FileNotFoundError, + IsADirectoryError, + PermissionError, + shutil.SameFileError, + OSError, + ) as e: raise EnvException( - f"File {source} copy to {destination} failed. Check whether the source and destinations are valid." + f"File {source} copy to {destination} failed. Check whether the source and destinations are valid. {e}" ) @@ -189,9 +200,9 @@ def undo_edit_script(script_name, work_dir=".", **kwargs): new_content = open(os.path.join(work_dir, script_name)).read() observation = f"Content of {script_name} after undo the most recent edit:\n" + new_content return observation - except: + except (FileNotFoundError, IsADirectoryError, PermissionError, OSError, shutil.Error) as e: raise EnvException( - f"Cannot undo the edit of file name {script_name}. Check the file name again." + f"Cannot undo the edit of file name {script_name}. Check the file name again. {e}" ) diff --git a/evals/elsuite/hr_ml_agent_bench/scripts/plot_experiments.py b/evals/elsuite/hr_ml_agent_bench/scripts/plot_experiments.py index 1e849dca91..94026faaf3 100644 --- a/evals/elsuite/hr_ml_agent_bench/scripts/plot_experiments.py +++ b/evals/elsuite/hr_ml_agent_bench/scripts/plot_experiments.py @@ -1,7 +1,7 @@ # %% -import os import json +import os import textwrap import matplotlib.lines as mlines diff --git a/evals/elsuite/identifying_variables/renderers/__init__.py b/evals/elsuite/identifying_variables/renderers/__init__.py index c155624761..b10723d50b 100644 --- a/evals/elsuite/identifying_variables/renderers/__init__.py +++ b/evals/elsuite/identifying_variables/renderers/__init__.py @@ -1,5 +1,4 @@ -from . import tabular -from . import corrset +from . import corrset, tabular RENDERER_MAP = { "markdown": tabular.MarkdownTableRenderer, diff --git a/evals/elsuite/identifying_variables/renderers/corrset.py b/evals/elsuite/identifying_variables/renderers/corrset.py index 39563527a6..e04f3b413e 100644 --- a/evals/elsuite/identifying_variables/renderers/corrset.py +++ b/evals/elsuite/identifying_variables/renderers/corrset.py @@ -1,10 +1,10 @@ from typing import List, Set, Tuple -from evals.elsuite.identifying_variables.structs import Sample -from evals.elsuite.identifying_variables.renderers.base import RendererBase import evals.elsuite.identifying_variables.graph_utils as graph_utils import evals.elsuite.identifying_variables.renderers.templates as templates from evals.elsuite.identifying_variables.constants import SPARSITY_FOR_UNOBS +from evals.elsuite.identifying_variables.renderers.base import RendererBase +from evals.elsuite.identifying_variables.structs import Sample class CorrSetRenderer(RendererBase): @@ -35,8 +35,7 @@ def determine_sample_type(self, sample: Sample) -> Tuple[str, List[Set[str]]]: unobserved_vars = set( var for var in sample.variable_metadata - if sample.variable_metadata[var]["extra"]["sparsity_rate"] - > SPARSITY_FOR_UNOBS + if sample.variable_metadata[var]["extra"]["sparsity_rate"] > SPARSITY_FOR_UNOBS ) for tree in graph_trees: correl_set = set(tree) @@ -69,8 +68,7 @@ def _get_hypd_unobserved_vars(self, sample: Sample) -> List[str]: vars_to_mention = [ var for var in hypothesized_vars - if sample.variable_metadata[var]["extra"]["sparsity_rate"] - > SPARSITY_FOR_UNOBS + if sample.variable_metadata[var]["extra"]["sparsity_rate"] > SPARSITY_FOR_UNOBS ] return vars_to_mention @@ -191,9 +189,7 @@ def render_many_sets(self, correl_sets: List[Set[str]]): transition_phrase = "" current_set_idx = correl_set_idx - mentioned_vars_from_set = correl_set_idx_to_already_mentioned_vars[ - correl_set_idx - ] + mentioned_vars_from_set = correl_set_idx_to_already_mentioned_vars[correl_set_idx] if len(mentioned_vars_from_set) == 0: # first time mentioning this set mention_string = templates.IND_VARS_EXAMPLE.format( optional_transition=transition_phrase, @@ -319,6 +315,7 @@ def mention_unobserved_vars(self, sample: Sample) -> str: if __name__ == "__main__": import random + import numpy as np list_of_lists = [ @@ -332,10 +329,12 @@ def mention_unobserved_vars(self, sample: Sample) -> str: np_rng = np.random.default_rng(0) renderer = PureCorrSetRenderer(random.Random(0), np_rng) - from evals.elsuite.identifying_variables.scripts.gen_data import gen_samples - import networkx as nx from pprint import pprint + import networkx as nx + + from evals.elsuite.identifying_variables.scripts.gen_data import gen_samples + samples = gen_samples(10, None, np_rng) for sample in samples: diff --git a/evals/elsuite/identifying_variables/renderers/tabular.py b/evals/elsuite/identifying_variables/renderers/tabular.py index 0feb8b38fe..cab5cdd6e7 100644 --- a/evals/elsuite/identifying_variables/renderers/tabular.py +++ b/evals/elsuite/identifying_variables/renderers/tabular.py @@ -1,18 +1,15 @@ -from typing import Optional, Tuple, Union, List import json import random +from typing import List, Optional, Tuple, Union import networkx as nx import numpy as np import pandas as pd -from evals.elsuite.identifying_variables.structs import Sample -from evals.elsuite.identifying_variables.renderers.base import RendererBase -from evals.elsuite.identifying_variables.latent_funcs import ( - DISTRIBUTIONS, - LATENT_FUNC_MAP, -) from evals.elsuite.identifying_variables.constants import NUM_OBS +from evals.elsuite.identifying_variables.latent_funcs import DISTRIBUTIONS, LATENT_FUNC_MAP +from evals.elsuite.identifying_variables.renderers.base import RendererBase +from evals.elsuite.identifying_variables.structs import Sample def apply_noise( @@ -77,9 +74,7 @@ def _render_table(self, sample: Sample) -> pd.DataFrame: if "input_x" not in variable_metadata[var]["gen_method"]: distr = DISTRIBUTIONS[gen_method] distr_kwargs = variable_metadata[var]["gen_method"]["kwargs"] - data_dict[var] = distr( - num_samples=n_obs_samples, **distr_kwargs, rng=self.np_rng - ) + data_dict[var] = distr(num_samples=n_obs_samples, **distr_kwargs, rng=self.np_rng) else: latent_func = LATENT_FUNC_MAP[gen_method] latent_func_kwargs = variable_metadata[var]["gen_method"]["kwargs"] @@ -170,9 +165,7 @@ def _render_row( if np.isnan(value): string += f"{var} was not {past_participle_verb}. " else: - string += ( - f"{var} was {past_participle_verb} to be {format_number(value)}. " - ) + string += f"{var} was {past_participle_verb} to be {format_number(value)}. " return string diff --git a/evals/elsuite/identifying_variables/scripts/gen_data.py b/evals/elsuite/identifying_variables/scripts/gen_data.py index 14c5f78e28..eca4763a4d 100644 --- a/evals/elsuite/identifying_variables/scripts/gen_data.py +++ b/evals/elsuite/identifying_variables/scripts/gen_data.py @@ -4,28 +4,28 @@ Use default argparse args to replicate the dataset used for the report """ -from dataclasses import asdict -import os import argparse -from typing import Dict, List, Optional, Set, Tuple, Any -import json import copy +import json +import os +from dataclasses import asdict +from typing import Any, Dict, List, Optional, Set, Tuple -from tqdm.auto import tqdm import networkx as nx import numpy as np +from tqdm.auto import tqdm +import evals.elsuite.identifying_variables.constants as constants import evals.elsuite.identifying_variables.latent_funcs as latent_funcs from evals.elsuite.identifying_variables.graph_utils import ( - gen_random_forest, - gen_random_forest_tree_size, + find_connected_nodes_pair, find_graph_roots, find_unconnected_nodes_pair, - find_connected_nodes_pair, + gen_random_forest, + gen_random_forest_tree_size, ) +from evals.elsuite.identifying_variables.structs import Answer, Sample from evals.elsuite.identifying_variables.utils import sample_serializer -from evals.elsuite.identifying_variables.structs import Sample, Answer -import evals.elsuite.identifying_variables.constants as constants def write_to_jsonl( @@ -37,9 +37,7 @@ def write_to_jsonl( f.write(json.dumps(asdict(sample), default=sample_serializer) + "\n") -def random_latent_func_meta( - np_rng: np.random.Generator, input_x: Optional[str] = None -) -> Dict: +def random_latent_func_meta(np_rng: np.random.Generator, input_x: Optional[str] = None) -> Dict: """ Generates random metadata for defining a latent function @@ -86,9 +84,7 @@ def build_var_metadata( roots = find_graph_roots(causal_graph) root_to_descendants = {r: nx.descendants(causal_graph, r) for r in roots} node_to_root = { - n: root - for root, descendants in root_to_descendants.items() - for n in descendants + n: root for root, descendants in root_to_descendants.items() for n in descendants } for var in causal_graph: @@ -123,9 +119,7 @@ def sparsify_data(var_metadata, sparse_var_rate, np_rng): orig_var_metadata = copy.deepcopy(var_metadata) for var in var_metadata.keys(): if np_rng.uniform(0, 1) < sparse_var_rate: - sparsity_rate = np_rng.uniform( - low=constants.MIN_SPARSITY, high=constants.MAX_SPARSITY - ) + sparsity_rate = np_rng.uniform(low=constants.MIN_SPARSITY, high=constants.MAX_SPARSITY) var_metadata[var]["extra"]["sparsity_rate"] = sparsity_rate if sparsity_rate > constants.SPARSITY_FOR_UNOBS: # remove unobserved variables from correlations @@ -168,9 +162,7 @@ def gen_sample_balanced_ctrl_vars( low=constants.MIN_SPARSE_VAR_RATE, high=constants.MAX_SPARSE_VAR_RATE ) # perc of variables to sparsify - var_ids = np_rng.choice(np.arange(1000, 10000), size=n_vars, replace=False).astype( - str - ) + var_ids = np_rng.choice(np.arange(1000, 10000), size=n_vars, replace=False).astype(str) var_names = [f"x_{var_id}" for var_id in var_ids] num_ctrl_vars = np_rng.integers(low=0, high=n_vars - 1) # high is exclusive @@ -241,9 +233,7 @@ def gen_sample( low=constants.MIN_SPARSE_VAR_RATE, high=constants.MAX_SPARSE_VAR_RATE ) # perc of variables to sparsify - var_ids = np_rng.choice(np.arange(1000, 10000), size=n_vars, replace=False).astype( - str - ) + var_ids = np_rng.choice(np.arange(1000, 10000), size=n_vars, replace=False).astype(str) var_names = [f"x_{var_id}" for var_id in var_ids] causal_graph = gen_random_forest(var_names, np_rng=np_rng) @@ -291,9 +281,7 @@ def gen_sample( ) -def determine_gold_label( - target_hyp, variable_metadata, hypotheses -) -> Tuple[Answer, Optional[int]]: +def determine_gold_label(target_hyp, variable_metadata, hypotheses) -> Tuple[Answer, Optional[int]]: """ Determines the ideal `Answer` for a given sample. Additionally returns the number of variables not controlled for, if the hypothesis is valid, @@ -304,9 +292,7 @@ def determine_gold_label( ctrl_vars, not_ctrls = None, None num_not_ctrl = None else: - ctrl_vars, not_ctrls = determine_ctrl_vars( - variable_metadata, ind_var, dep_var, hypotheses - ) + ctrl_vars, not_ctrls = determine_ctrl_vars(variable_metadata, ind_var, dep_var, hypotheses) # worst case ctrl: all vars that aren't meant to be ctrld are ctrld num_not_ctrl = len(not_ctrls) @@ -329,12 +315,10 @@ def parse_target_hyp( proposed_dep = target_hyp[1] ind_unobserved = ( - variable_metadata[proposed_ind]["extra"]["sparsity_rate"] - > constants.SPARSITY_FOR_UNOBS + variable_metadata[proposed_ind]["extra"]["sparsity_rate"] > constants.SPARSITY_FOR_UNOBS ) dep_unobserved = ( - variable_metadata[proposed_dep]["extra"]["sparsity_rate"] - > constants.SPARSITY_FOR_UNOBS + variable_metadata[proposed_dep]["extra"]["sparsity_rate"] > constants.SPARSITY_FOR_UNOBS ) # if either are unobserved, we have no evidence that they are not correlated @@ -384,16 +368,11 @@ def are_correlated(var_1, var_2, variable_metadata) -> Optional[bool]: of correlation, returns None. """ if ( - variable_metadata[var_1]["extra"]["sparsity_rate"] - > constants.SPARSITY_FOR_UNOBS - or variable_metadata[var_2]["extra"]["sparsity_rate"] - > constants.SPARSITY_FOR_UNOBS + variable_metadata[var_1]["extra"]["sparsity_rate"] > constants.SPARSITY_FOR_UNOBS + or variable_metadata[var_2]["extra"]["sparsity_rate"] > constants.SPARSITY_FOR_UNOBS ): return None - return ( - var_2 in variable_metadata[var_1]["corrs"] - or var_1 in variable_metadata[var_2]["corrs"] - ) + return var_2 in variable_metadata[var_1]["corrs"] or var_1 in variable_metadata[var_2]["corrs"] def integrate_target_hyp( @@ -436,9 +415,7 @@ def main(args: argparse.Namespace): if not args.balanced_ctrl_vars: jsonl_path = os.path.join(args.jsonl_dir, f"{args.n_samples}.jsonl") else: - jsonl_path = os.path.join( - args.jsonl_dir, f"{args.n_samples}_balanced_ctrl_vars.jsonl" - ) + jsonl_path = os.path.join(args.jsonl_dir, f"{args.n_samples}_balanced_ctrl_vars.jsonl") write_to_jsonl(samples, jsonl_path) diff --git a/evals/elsuite/identifying_variables/scripts/make_plots.py b/evals/elsuite/identifying_variables/scripts/make_plots.py index f29f781492..1dea6aef77 100644 --- a/evals/elsuite/identifying_variables/scripts/make_plots.py +++ b/evals/elsuite/identifying_variables/scripts/make_plots.py @@ -10,9 +10,7 @@ plot_difficulty_bars, plot_solver_bars, ) -from evals.elsuite.identifying_variables.scripts.table_utils import ( - make_main_metric_table, -) +from evals.elsuite.identifying_variables.scripts.table_utils import make_main_metric_table from evals.utils import log_utils NUM_REPEATS = 3 @@ -78,9 +76,7 @@ def handle_cot_double_sampling(sampling_entries, solver): if ( # for chat models we filter like this isinstance(entry["prompt"], list) - and entry["prompt"][-1]["content"].startswith( - "Given the above reasoning" - ) + and entry["prompt"][-1]["content"].startswith("Given the above reasoning") or ( # for base models we need to filter like this isinstance(entry["prompt"], str) @@ -102,9 +98,7 @@ def handle_posthoc_metrics(final_results: Dict, log_path: Path, solver: str): # this is necessary because we originally didnt compute recall in the eval for metric in MAIN_METRICS: if metric not in final_results.keys(): - final_results[metric] = compute_metric_posthoc( - metric, metric_entries, sampling_entries - ) + final_results[metric] = compute_metric_posthoc(metric, metric_entries, sampling_entries) return final_results @@ -130,9 +124,9 @@ def populate_default_results_dict(results_dict, results_dir): results_dict[metric]["raw"][solver][renderer][tree_key].append(value) raw = results_dict[metric]["raw"][solver][renderer][tree_key] results_dict[metric]["mean"][solver][renderer][tree_key] = np.mean(raw) - results_dict[metric]["sem"][solver][renderer][tree_key] = np.std( - raw - ) / np.sqrt(NUM_REPEATS) + results_dict[metric]["sem"][solver][renderer][tree_key] = np.std(raw) / np.sqrt( + NUM_REPEATS + ) for metric in results_dict.keys(): del results_dict[metric]["raw"] return results_dict @@ -158,9 +152,7 @@ def make_default_plots(results_dict: Dict, save_dir: Path): metric_labels = ["Control Variable Retrieval nDCG*", "Control Variable Recall"] fig_heights = [6, 5] - for metric, metric_label, fig_height in tqdm( - zip(metrics, metric_labels, fig_heights) - ): + for metric, metric_label, fig_height in tqdm(zip(metrics, metric_labels, fig_heights)): plot_solver_bars( bar_solvers, baseline_solvers, @@ -186,9 +178,7 @@ def extract_large_results_dict(results_dir: Path) -> Dict: } for bbin in ctrl_nDCG_bins: - results_dict[key][bbin]["raw"].append( - final_results[f"ctrl_nDCG-n_ctrl_vars-{bbin}"] - ) + results_dict[key][bbin]["raw"].append(final_results[f"ctrl_nDCG-n_ctrl_vars-{bbin}"]) for key in results_dict.keys(): for bbin in ctrl_nDCG_bins: mean = np.mean(results_dict[key][bbin]["raw"]) @@ -202,9 +192,7 @@ def extract_large_results_dict(results_dir: Path) -> Dict: def make_large_plot(large_results_dir: Dict, save_dir: Path): ctrl_vars_bins = list(range(0, 9)) - plot_difficulty_bars( - large_results_dir, ctrl_vars_bins, save_dir / "ctrl_nDCG_difficulty.png" - ) + plot_difficulty_bars(large_results_dir, ctrl_vars_bins, save_dir / "ctrl_nDCG_difficulty.png") def np_nan_if_none(input_num): @@ -274,15 +262,10 @@ def count_tokens(results_dir: Path, total) -> Tuple[Dict, pd.DataFrame]: "generation/cot/gpt-4-1106-preview", ] solver_to_eval = { - solver: eval_names[0] if "cot" not in solver else eval_names[1] - for solver in solver_names - } - solver_to_tree = { - solver: False if "cot" not in solver else True for solver in solver_names - } - solver_to_tokens = { - solver: {"input": [], "output": [], "total": []} for solver in solver_names + solver: eval_names[0] if "cot" not in solver else eval_names[1] for solver in solver_names } + solver_to_tree = {solver: False if "cot" not in solver else True for solver in solver_names} + solver_to_tokens = {solver: {"input": [], "output": [], "total": []} for solver in solver_names} total_input = 0 total_output = 0 for log in tqdm(results_dir.glob("*.log"), total=total): @@ -304,15 +287,11 @@ def count_tokens(results_dir: Path, total) -> Tuple[Dict, pd.DataFrame]: and seed == 1 and tree != solver_to_tree[solver] ): - solver_to_tokens[solver]["input"].append( - np_nan_if_none(usage["prompt_tokens"]) - ) + solver_to_tokens[solver]["input"].append(np_nan_if_none(usage["prompt_tokens"])) solver_to_tokens[solver]["output"].append( np_nan_if_none(usage["completion_tokens"]) ) - solver_to_tokens[solver]["total"].append( - np_nan_if_none(usage["total_tokens"]) - ) + solver_to_tokens[solver]["total"].append(np_nan_if_none(usage["total_tokens"])) total_input += zero_if_none(usage["prompt_tokens"]) total_output += zero_if_none(usage["completion_tokens"]) @@ -338,9 +317,7 @@ def make_total_tokens_table(default_total: Dict, large_total: Dict) -> pd.DataFr return total_tokens_df -def make_token_count_tables( - default_results_dir: Path, large_results_dir: Path, save_dir: Path -): +def make_token_count_tables(default_results_dir: Path, large_results_dir: Path, save_dir: Path): default_total_tokens, default_per_sample_tokens_df = count_tokens( default_results_dir, total=222 ) diff --git a/evals/elsuite/identifying_variables/scripts/plotting_utils.py b/evals/elsuite/identifying_variables/scripts/plotting_utils.py index 1c80aab042..1800e6e75c 100644 --- a/evals/elsuite/identifying_variables/scripts/plotting_utils.py +++ b/evals/elsuite/identifying_variables/scripts/plotting_utils.py @@ -1,11 +1,10 @@ -from typing import Dict, Iterable, List from pathlib import Path +from typing import Dict, Iterable, List -import numpy as np import matplotlib.pyplot as plt +import numpy as np import seaborn as sns - renderers_of_interest = ["csv", "language-corrset"] renderer_to_label = { @@ -37,12 +36,8 @@ } cmap = plt.get_cmap("Set2") -bline_colors = np.array( - [cmap(i) for i in range(0, len(baseline_to_linestyle.keys()) + 0)] -) -baseline_to_color = { - key: color for key, color in zip(baseline_to_linestyle.keys(), bline_colors) -} +bline_colors = np.array([cmap(i) for i in range(0, len(baseline_to_linestyle.keys()) + 0)]) +baseline_to_color = {key: color for key, color in zip(baseline_to_linestyle.keys(), bline_colors)} def plot_solver_bars( @@ -74,14 +69,8 @@ def plot_solver_bars( f, ax = plt.subplots(1, 1, dpi=300, figsize=(9, fig_height)) for i, renderer in enumerate(renderers_of_interest): - bars = [ - metric_results["mean"][solver][renderer]["without tree"] - for solver in bar_solvers - ] - errors = [ - metric_results["sem"][solver][renderer]["without tree"] - for solver in bar_solvers - ] + bars = [metric_results["mean"][solver][renderer]["without tree"] for solver in bar_solvers] + errors = [metric_results["sem"][solver][renderer]["without tree"] for solver in bar_solvers] ax.bar( positions + bar_width * i, @@ -101,9 +90,7 @@ def plot_solver_bars( color=baseline_to_color[baseline_solver], linestyle=baseline_to_linestyle[baseline_solver], ) - ax.axhspan( - mean - sem, mean + sem, alpha=0.1, color=baseline_to_color[baseline_solver] - ) + ax.axhspan(mean - sem, mean + sem, alpha=0.1, color=baseline_to_color[baseline_solver]) ax.set_xticks( positions + bar_width / 2, @@ -111,9 +98,7 @@ def plot_solver_bars( rotation=45, ha="right", ) - ax.tick_params( - axis="x", which="both", bottom=True - ) # Show both major and minor xticks + ax.tick_params(axis="x", which="both", bottom=True) # Show both major and minor xticks ax.set_ylabel(metric_label) ax.set_ylim(-0.005, 1) ax.xaxis.grid(False) diff --git a/evals/elsuite/identifying_variables/scripts/table_utils.py b/evals/elsuite/identifying_variables/scripts/table_utils.py index 3991cd469b..e55da0bfb2 100644 --- a/evals/elsuite/identifying_variables/scripts/table_utils.py +++ b/evals/elsuite/identifying_variables/scripts/table_utils.py @@ -1,7 +1,6 @@ -from typing import Dict, List from pathlib import Path +from typing import Dict, List -import numpy as np import pandas as pd @@ -42,16 +41,10 @@ def make_main_metric_table( if value is not None: data_tuples.append((solver, tree_type, renderer, stat, value)) - df = pd.DataFrame( - data_tuples, columns=["Solver", "Tree", "Renderer", "Stat", "Value"] - ) - df = df.pivot_table( - index=["Solver", "Tree"], columns=["Renderer", "Stat"], values="Value" - ) + df = pd.DataFrame(data_tuples, columns=["Solver", "Tree", "Renderer", "Stat", "Value"]) + df = df.pivot_table(index=["Solver", "Tree"], columns=["Renderer", "Stat"], values="Value") # sorting by solvers, renderers (for some reason ordering is lost in the above process) - new_index = [ - (solver, tree) for solver in solvers for tree in ["with tree", "without tree"] - ] + new_index = [(solver, tree) for solver in solvers for tree in ["with tree", "without tree"]] new_columns = pd.MultiIndex.from_product( [renderers, df.columns.levels[1]], names=df.columns.names ) diff --git a/evals/elsuite/make_me_say/core.py b/evals/elsuite/make_me_say/core.py index e1b1d2181c..1317ae8070 100644 --- a/evals/elsuite/make_me_say/core.py +++ b/evals/elsuite/make_me_say/core.py @@ -201,7 +201,7 @@ def view_as(self, player: Player) -> tuple[Message, ...]: def _mentions_codeword(self, s: str) -> bool: lemmatizer = WordNetLemmatizer() - s_lowered = s.lower().replace('"', '').replace("'", '').replace('“', '').replace('”', '') + s_lowered = s.lower().replace('"', "").replace("'", "").replace("“", "").replace("”", "") words = word_tokenize(s_lowered) for codeword in self.word_list: diff --git a/evals/elsuite/skill_acquisition/scraping/scrape_miskito.py b/evals/elsuite/skill_acquisition/scraping/scrape_miskito.py index 697b5667cd..bee0856e76 100644 --- a/evals/elsuite/skill_acquisition/scraping/scrape_miskito.py +++ b/evals/elsuite/skill_acquisition/scraping/scrape_miskito.py @@ -6,7 +6,7 @@ from bs4 import BeautifulSoup from markdownify import markdownify as md -# TODO: make sure italicised text is crawled properly and that hints are excluded from answers. +# TODO: make sure italicised text is crawled properly and that hints are excluded from answers. # TODO: Split any multi-part questions into individual questions. miskito_base_url = "https://en.wikibooks.org/wiki/Miskito/Lesson_{idx}" diff --git a/evals/elsuite/solver_tools_convo.py b/evals/elsuite/solver_tools_convo.py index 8a13adf80b..e052c30629 100644 --- a/evals/elsuite/solver_tools_convo.py +++ b/evals/elsuite/solver_tools_convo.py @@ -172,7 +172,7 @@ def _run_tool_call(self, tool_call: ToolCall) -> ToolCall: out = tool(task_state) except (TypeError, ValueError, IndexError): out = None - + if out is None: return None diff --git a/evals/elsuite/track_the_stat/prompts/__init__.py b/evals/elsuite/track_the_stat/prompts/__init__.py index 31d1b2de39..3d9713aa60 100644 --- a/evals/elsuite/track_the_stat/prompts/__init__.py +++ b/evals/elsuite/track_the_stat/prompts/__init__.py @@ -1,6 +1,4 @@ -from . import mode -from . import median - +from . import median, mode TASK_DESCRIPTION = """\ In the following messages, you will be given a sequence of numbers.\ diff --git a/evals/elsuite/track_the_stat/scripts/make_plots.py b/evals/elsuite/track_the_stat/scripts/make_plots.py index b40e4a3586..0c888946d0 100644 --- a/evals/elsuite/track_the_stat/scripts/make_plots.py +++ b/evals/elsuite/track_the_stat/scripts/make_plots.py @@ -1,11 +1,11 @@ -from pathlib import Path import argparse import json +from pathlib import Path -from tqdm.auto import tqdm -import numpy as np import matplotlib.pyplot as plt +import numpy as np import seaborn as sns +from tqdm.auto import tqdm from evals.utils import log_utils @@ -86,9 +86,7 @@ def fill_results_dict(results_dict, log_dir): model = get_model(spec) state_tracking = get_state_tracking(spec) for stat in results_dict: - results_dict[stat][task][model][state_tracking]["raw"].append( - final_result[stat] - ) + results_dict[stat][task][model][state_tracking]["raw"].append(final_result[stat]) # compute means/std_errs for file in tqdm(files): spec = specs[file] @@ -97,9 +95,7 @@ def fill_results_dict(results_dict, log_dir): state_tracking = get_state_tracking(spec) for stat in results_dict: data_points = results_dict[stat][task][model][state_tracking]["raw"] - results_dict[stat][task][model][state_tracking]["mean"] = np.mean( - data_points - ) + results_dict[stat][task][model][state_tracking]["mean"] = np.mean(data_points) results_dict[stat][task][model][state_tracking]["std_err"] = np.std( data_points ) / np.sqrt(len(data_points) if len(data_points) > 1 else 1) @@ -110,10 +106,7 @@ def prepare_results_dict(): results_dict = { stat: { task: { - model: { - state_tracking: {"raw": []} - for state_tracking in ["implicit", "explicit"] - } + model: {state_tracking: {"raw": []} for state_tracking in ["implicit", "explicit"]} for model in MODELS } for task in ["mode", "median"] @@ -134,13 +127,8 @@ def make_bar_plot(results_dict: dict, task: str, stat: str, save_path: Path): state_tracking_kinds = ["explicit", "implicit"] - means = [ - [data[model][cat]["mean"] for cat in state_tracking_kinds] for model in models - ] - std_errs = [ - [data[model][cat]["std_err"] for cat in state_tracking_kinds] - for model in models - ] + means = [[data[model][cat]["mean"] for cat in state_tracking_kinds] for model in models] + std_errs = [[data[model][cat]["std_err"] for cat in state_tracking_kinds] for model in models] cmap = plt.get_cmap("Paired") colors = np.array([cmap(i) for i in range(len(state_tracking_kinds))]) @@ -171,8 +159,7 @@ def make_bar_plot(results_dict: dict, task: str, stat: str, save_path: Path): ax.set_xlabel(STAT_TO_LABEL[stat]) # maximum x + xerr value times 1.2 x_max = ( - max([m for mean in means for m in mean]) - + max([e for err in std_errs for e in err]) + max([m for mean in means for m in mean]) + max([e for err in std_errs for e in err]) ) * 1.2 ax.set_xlim([0, x_max]) ax.set_yticks(x) @@ -270,7 +257,7 @@ def main(args: argparse.Namespace): results_dict = make_results_dict(log_dir) - for stat in tqdm(results_dict.keys(), desc=f"Plotting..."): + for stat in tqdm(results_dict.keys(), desc="Plotting..."): for task in tqdm(["mode", "median"], desc=f"Plotting {stat}"): save_path = save_dir / f"{task}_{stat}.png" make_bar_plot(results_dict, task, stat, save_path) @@ -286,11 +273,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--log_dir", type=str, required=True, help="Where the logs are stored" - ) - parser.add_argument( - "--save_dir", type=str, required=True, help="Where to save the plots" - ) + parser.add_argument("--log_dir", type=str, required=True, help="Where the logs are stored") + parser.add_argument("--save_dir", type=str, required=True, help="Where to save the plots") args = parser.parse_args() main(args) diff --git a/evals/elsuite/twenty_questions/scripts/make_plots.py b/evals/elsuite/twenty_questions/scripts/make_plots.py index f07b76da5a..449b0c76e6 100644 --- a/evals/elsuite/twenty_questions/scripts/make_plots.py +++ b/evals/elsuite/twenty_questions/scripts/make_plots.py @@ -1,6 +1,6 @@ import argparse -from pathlib import Path import os +from pathlib import Path import matplotlib.pyplot as plt import pandas as pd @@ -40,16 +40,19 @@ "average_num_gamemaster_refusals": 0.1111, "average_num_incorrect_guesses": 0.8611, "average_word_difficulty": 2.2777, - } + }, } UNIT_METRICS = ["winrate"] + def extract_metrics(datadir: Path) -> pd.DataFrame: df_rows = [] # There are two eval variants: standard and shortlist. for variant in os.listdir(datadir): - for path, results in sorted(list(log_utils.get_final_results_from_dir(f"{datadir}/{variant}").items())): + for path, results in sorted( + list(log_utils.get_final_results_from_dir(f"{datadir}/{variant}").items()) + ): spec = log_utils.extract_spec(path) solver_path = Path(spec["completion_fns"][0]) model = solver_path.name @@ -64,6 +67,7 @@ def extract_metrics(datadir: Path) -> pd.DataFrame: return df + def make_plot(df: pd.DataFrame, outpath: Path, metric="score", variant="standard"): df = df.round(2) plt.figure() @@ -76,22 +80,42 @@ def compute_sem(x): upper = (x.mean() + sem2).round(2) return lower, upper - # Plotting sns.set(style="whitegrid") ax = sns.barplot(x=metric, y="model", hue="solver", data=df, errorbar=compute_sem, capsize=0.1) for container in ax.containers: ax.bar_label(container, fmt="{:.2f}", label_type="edge", padding=15) - + ax.axvline(HUMAN_BASELINE[variant][metric], color="red", linestyle="--") # A bunch of tweaks to make individual plots look nice. if variant == "shortlist" and metric == "winrate": - plt.text(HUMAN_BASELINE[variant][metric] - 0.35, .5, "Human baseline", color="red", fontsize=12, ha="left") + plt.text( + HUMAN_BASELINE[variant][metric] - 0.35, + 0.5, + "Human baseline", + color="red", + fontsize=12, + ha="left", + ) elif variant == "standard" and metric == "average_num_questions": - plt.text(HUMAN_BASELINE[variant][metric] - 7, .5, "Human baseline", color="red", fontsize=12, ha="left") + plt.text( + HUMAN_BASELINE[variant][metric] - 7, + 0.5, + "Human baseline", + color="red", + fontsize=12, + ha="left", + ) else: - plt.text(HUMAN_BASELINE[variant][metric] + 0.05, .5, "Human baseline", color="red", fontsize=12, ha="left") + plt.text( + HUMAN_BASELINE[variant][metric] + 0.05, + 0.5, + "Human baseline", + color="red", + fontsize=12, + ha="left", + ) # Some of the metrics are in [0, 1]. if metric in UNIT_METRICS: @@ -112,6 +136,7 @@ def compute_sem(x): plt.savefig(outpath) plt.close() + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--log-dir", "-d", type=str, required=True) @@ -125,18 +150,27 @@ def compute_sem(x): df = extract_metrics(log_dir) # Rename some of the solver values so they can be represented in the same plot. - df.loc[df['solver'] == 'cot_hhh', 'solver'] = 'cot' - df.loc[df['solver'] == 'hhh', 'solver'] = 'direct' + df.loc[df["solver"] == "cot_hhh", "solver"] = "cot" + df.loc[df["solver"] == "hhh", "solver"] = "direct" - for variant in df['variant'].unique(): - df_per_variant = df[df['variant'] == variant] + for variant in df["variant"].unique(): + df_per_variant = df[df["variant"] == variant] print(f"Plotting all metrics for {variant} variant...") core_metrics = ["score", "winrate"] - auxiliary_metrics = ["average_num_guesses", "average_num_questions", "average_num_violations", "average_num_gamemaster_refusals", "average_num_incorrect_guesses", "average_word_difficulty"] + auxiliary_metrics = [ + "average_num_guesses", + "average_num_questions", + "average_num_violations", + "average_num_gamemaster_refusals", + "average_num_incorrect_guesses", + "average_word_difficulty", + ] for metric in core_metrics + auxiliary_metrics: - make_plot(df_per_variant[["model", "solver", metric]].copy(), - out_dir / f"{variant}_{metric}.png", - metric, - variant) \ No newline at end of file + make_plot( + df_per_variant[["model", "solver", metric]].copy(), + out_dir / f"{variant}_{metric}.png", + metric, + variant, + ) diff --git a/evals/elsuite/twenty_questions/test_utils.py b/evals/elsuite/twenty_questions/test_utils.py index 069b86c8ad..2d1f885aad 100644 --- a/evals/elsuite/twenty_questions/test_utils.py +++ b/evals/elsuite/twenty_questions/test_utils.py @@ -1,12 +1,15 @@ from utils import format_msg, format_msgs + from evals.task_state import Message + def test_format_msg(): msg = Message(content="I'm a message", role="guesser") assert format_msg(msg, "guesser") == Message(content="I'm a message", role="assistant") assert format_msg(msg, "gamemaster") == Message(content="I'm a message", role="user") + def test_format_msgs(): msgs = [ Message(content="I'm a guesser message", role="guesser"), @@ -19,9 +22,9 @@ def test_format_msgs(): Message(content="I'm a gamemaster message", role="user"), Message(content="I'm another guesser message", role="assistant"), ] - + assert format_msgs(msgs, "gamemaster") == [ Message(content="I'm a guesser message", role="user"), Message(content="I'm a gamemaster message", role="assistant"), Message(content="I'm another guesser message", role="user"), - ] \ No newline at end of file + ] diff --git a/evals/elsuite/twenty_questions/utils.py b/evals/elsuite/twenty_questions/utils.py index 169dd0a977..d4776b0b83 100644 --- a/evals/elsuite/twenty_questions/utils.py +++ b/evals/elsuite/twenty_questions/utils.py @@ -1,4 +1,5 @@ -from typing import Literal, List, Optional +from typing import List, Literal, Optional + from evals.task_state import Message, TaskState PROMPTS = { @@ -6,13 +7,26 @@ "guesser_shortlist": "You are about to play the game '20 questions'. The other player has thought of a noun which you should try to guess. The noun is in this list: {shortlist}.\nYou can ask 20 yes/no questions, to which they will answer 'yes', 'no', or 'skip' (if they cannot answer your question). You should try to guess the word as soon as possible, using the least amount of questions. To guess a word, type [GUESS ] – for example to guess the word 'sibling', output [GUESS sibling]. Your score will be 0 if you do not guess correctly, and {max_questions} minus the number of questions you asked if you guess correctly. Start by asking your first question.", "gamemaster": "You are about to play the game '20 questions'. You'll be given a noun, which the other player will try to guess. They will ask 20 yes/no questions to try to guess the word. You will answer their questions with 'yes', 'no', or 'skip'. The word is: {word}.", "incorrect_guess": "Guess {guess} is incorrect. Ask more questions, or make another guess!", - "rule_violation": "Your output was neither a guess nor a question. Try again! You can ask a yes/no question, or make a guess by outputting [GUESS ]." + "rule_violation": "Your output was neither a guess nor a question. Try again! You can ask a yes/no question, or make a guess by outputting [GUESS ].", } -def generate_task_state_for(role: Literal["guesser", "gamemaster"], conversation: list[Message], word: Optional[str] = None, max_questions: int = 20, shortlist: Optional[List[str]] = None) -> TaskState: + +def generate_task_state_for( + role: Literal["guesser", "gamemaster"], + conversation: list[Message], + word: Optional[str] = None, + max_questions: int = 20, + shortlist: Optional[List[str]] = None, +) -> TaskState: """Generates a TaskState for the given role and conversation.""" if role == "guesser": - prompt = PROMPTS["guesser"].format(max_questions=max_questions) if shortlist is None else PROMPTS["guesser_shortlist"].format(max_questions=max_questions, shortlist=shortlist) + prompt = ( + PROMPTS["guesser"].format(max_questions=max_questions) + if shortlist is None + else PROMPTS["guesser_shortlist"].format( + max_questions=max_questions, shortlist=shortlist + ) + ) elif role == "gamemaster": prompt = PROMPTS[role].format(word=word) else: @@ -24,7 +38,7 @@ def generate_task_state_for(role: Literal["guesser", "gamemaster"], conversation task_description=prompt, messages=formatted_conversation, ) - + def format_msgs( messages: list[Message], @@ -39,6 +53,7 @@ def format_msgs( return new_messages + def format_msg(msg: Message, role: Literal["guesser", "gamemaster"]) -> Message: """Formats a single message from the perspective of the `role`.""" @@ -61,9 +76,10 @@ def format_msg(msg: Message, role: Literal["guesser", "gamemaster"]) -> Message: return new_message + def is_system_msg(m: Message) -> bool: assert isinstance(m, Message), "Message must be a Message type." assert hasattr(m, "role"), "Message must have a role." assert isinstance(m.role, str), "Message role must be a string." - return m.role == "system" \ No newline at end of file + return m.role == "system" diff --git a/evals/record.py b/evals/record.py index 8e8ebe9ae6..93fe085513 100644 --- a/evals/record.py +++ b/evals/record.py @@ -366,7 +366,11 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]): def record_final_report(self, final_report: Any): with bf.BlobFile(self.event_file_path, "ab") as f: - f.write((jsondumps({"final_report": final_report, "run_id": self.run_spec.run_id}) + "\n").encode("utf-8")) + f.write( + ( + jsondumps({"final_report": final_report, "run_id": self.run_spec.run_id}) + "\n" + ).encode("utf-8") + ) logging.info(f"Final report: {final_report}. Logged to {self.event_file_path}") diff --git a/evals/solvers/prompts/hhh_test.py b/evals/solvers/prompts/hhh_test.py index 58bb140e54..b73e1073b9 100644 --- a/evals/solvers/prompts/hhh_test.py +++ b/evals/solvers/prompts/hhh_test.py @@ -1,6 +1,23 @@ from evals.solvers.prompts.hhh import HHH_MSGS from evals.task_state import Message +MALDACENA_ABSTRACT = ( + "Recently, it has been proposed by Maldacena that large N limits of certain " + "conformal field theories in d dimensions can be described in terms of " + "supergravity (and string theory) on the product of d+1-dimensional AdS space " + "with a compact manifold. Here we elaborate on this idea and propose a precise " + "correspondence between conformal field theory observables and those of " + "supergravity: correlation functions in conformal field theory are given by the " + "dependence of the supergravity action on the asymptotic behavior at infinity. " + "In particular, dimensions of operators in conformal field theory are given by " + "masses of particles in supergravity. As quantitative confirmation of this " + "correspondence, we note that the Kaluza-Klein modes of Type IIB supergravity on " + "AdS5×S5 match with the chiral operators of N=4 super Yang-Mills theory in four " + "dimensions. With some further assumptions, one can deduce a Hamiltonian version " + "of the correspondence and show that the N=4 theory has a large N phase transition " + "related to the thermodynamics of AdS black holes." +) + # Expected HHH prompt HHH_PROMPT = """Below are a series of dialogues between various people and an AI assistant. The AI tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. The assistant is happy to help with almost anything, and will do its best to understand exactly what is needed. It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer. That said, the assistant is practical and really does its best, and doesn't let caution get too much in the way of being useful. @@ -22,7 +39,7 @@ Human: I found this abstract for a physics paper, and I heard that it's a very famous and important paper, but I had no idea what any of it means: -Recently, it has been proposed by Maldacena that large N limits of certain conformal field theories in d dimensions can be described in terms of supergravity (and string theory) on the product of d+1-dimensional AdS space with a compact manifold. Here we elaborate on this idea and propose a precise correspondence between conformal field theory observables and those of supergravity: correlation functions in conformal field theory are given by the dependence of the supergravity action on the asymptotic behavior at infinity. In particular, dimensions of operators in conformal field theory are given by masses of particles in supergravity. As quantitative confirmation of this correspondence, we note that the Kaluza-Klein modes of Type IIB supergravity on AdS5×S5 match with the chiral operators of N=4 super Yang-Mills theory in four dimensions. With some further assumptions, one can deduce a Hamiltonian version of the correspondence and show that the N=4 theory has a large N phase transition related to the thermodynamics of AdS black holes. +{MALDACENA_ABSTRACT} Can you explain it? diff --git a/evals/solvers/providers/anthropic/anthropic_solver_test.py b/evals/solvers/providers/anthropic/anthropic_solver_test.py index 9ba8fb1470..6bc5772e9d 100644 --- a/evals/solvers/providers/anthropic/anthropic_solver_test.py +++ b/evals/solvers/providers/anthropic/anthropic_solver_test.py @@ -1,14 +1,11 @@ import os + import pytest +from anthropic.types import ContentBlock, MessageParam, Usage from evals.record import DummyRecorder +from evals.solvers.providers.anthropic.anthropic_solver import AnthropicSolver, anth_to_openai_usage from evals.task_state import Message, TaskState -from evals.solvers.providers.anthropic.anthropic_solver import ( - AnthropicSolver, - anth_to_openai_usage, -) - -from anthropic.types import ContentBlock, MessageParam, Usage IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" MODEL_NAME = "claude-instant-1.2" @@ -32,9 +29,7 @@ def dummy_recorder(): yield recorder -@pytest.mark.skipif( - IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit." -) +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") def test_solver(dummy_recorder, anthropic_solver): """ Test that the solver generates a response coherent with the message history @@ -55,9 +50,7 @@ def test_solver(dummy_recorder, anthropic_solver): ) solver_res = solver(task_state=task_state) - assert ( - solver_res.output == answer - ), f"Expected '{answer}', but got {solver_res.output}" + assert solver_res.output == answer, f"Expected '{answer}', but got {solver_res.output}" def test_message_format(): @@ -71,9 +64,7 @@ def test_message_format(): msgs = [ Message(role="user", content="What is 2 + 2?"), Message(role="system", content="reason step by step"), - Message( - role="assistant", content="I don't need to reason for this, 2+2 is just 4" - ), + Message(role="assistant", content="I don't need to reason for this, 2+2 is just 4"), Message(role="system", content="now, given your reasoning, provide the answer"), ] anth_msgs = AnthropicSolver._convert_msgs_to_anthropic_format(msgs) @@ -89,17 +80,13 @@ def test_message_format(): MessageParam( role="assistant", content=[ - ContentBlock( - text="I don't need to reason for this, 2+2 is just 4", type="text" - ), + ContentBlock(text="I don't need to reason for this, 2+2 is just 4", type="text"), ], ), MessageParam( role="user", content=[ - ContentBlock( - text="now, given your reasoning, provide the answer", type="text" - ), + ContentBlock(text="now, given your reasoning, provide the answer", type="text"), ], ), ] @@ -126,6 +113,4 @@ def test_anth_to_openai_usage_zero_tokens(): "prompt_tokens": 0, "total_tokens": 0, } - assert ( - anth_to_openai_usage(usage) == expected - ), "Zero token cases are not handled correctly." + assert anth_to_openai_usage(usage) == expected, "Zero token cases are not handled correctly."