Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# CI workflow: run the repository's pre-commit hooks on every pull request
# and on pushes to main.
name: pre-commit

on:
  pull_request:
  push:
    branches: [main]

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      # Current major versions; the v2 actions target a retired Node runtime.
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string (unquoted 3.10 would load as 3.1).
          python-version: "3.9"

      - uses: pre-commit/action@v3.0.1
1 change: 1 addition & 0 deletions .github/workflows/run_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:

jobs:
check_files:
name: Unit tests
runs-on: ubuntu-latest

steps:
Expand Down
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ repos:
rev: 'v1.3.0'
hooks:
- id: mypy
additional_dependencies:
- typed-ast
args: ["--config-file=mypy.ini", "--no-site-packages"]

- repo: https://github.com/psf/black
Expand Down
8 changes: 5 additions & 3 deletions evals/cli/oaieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging
import shlex
import sys
from typing import Any, Mapping, Optional, Union, cast
from typing import Any, Optional, Union, cast

import evals
import evals.api
Expand Down Expand Up @@ -135,7 +135,7 @@ def run(args: OaiEvalArguments, registry: Optional[Registry] = None) -> str:

def parse_extra_eval_params(
param_str: Optional[str],
) -> Mapping[str, Union[str, int, float]]:
) -> dict[str, Union[str, int, float]]:
"""Parse a string of the form "key1=value1,key2=value2" into a dict."""
if not param_str:
return {}
Expand Down Expand Up @@ -227,7 +227,9 @@ def to_number(x: str) -> Union[int, float, str]:
try:
add_token_usage_to_result(result, recorder)
except Exception as e:
logger.error(f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected.")
logger.error(
f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected."
)
recorder.record_final_report(result)

if not (args.dry_run or args.local_run):
Expand Down
10 changes: 3 additions & 7 deletions evals/elsuite/already_said_that/scripts/gen_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import json
import os
import random
import json

import nltk
from nltk.corpus import wordnet
Expand Down Expand Up @@ -60,13 +60,9 @@ def main(args: argparse.Namespace):
parser = argparse.ArgumentParser()

parser.add_argument("--n_samples", type=int, default=500)
parser.add_argument(
"--n_words", type=int, default=100, help="Number of words in each sample"
)
parser.add_argument("--n_words", type=int, default=100, help="Number of words in each sample")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument(
"--jsonl_dir", type=str, default="./evals/registry/data/already_said_that/"
)
parser.add_argument("--jsonl_dir", type=str, default="./evals/registry/data/already_said_that/")

args = parser.parse_args()

Expand Down
40 changes: 13 additions & 27 deletions evals/elsuite/already_said_that/scripts/make_plots.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from pathlib import Path
import argparse
import json
from pathlib import Path

from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm.auto import tqdm

from evals.utils import log_utils

Expand Down Expand Up @@ -98,9 +98,7 @@ def make_results_dict(log_dir: Path) -> dict:
def prepare_results_dict() -> dict:
results_dict = {
stat: {
distractor: {
model: {"raw": [], "mean": 0, "std_err": 0} for model in MODELS
}
distractor: {model: {"raw": [], "mean": 0, "std_err": 0} for model in MODELS}
for distractor in DISTRACTORS
}
for stat in [
Expand Down Expand Up @@ -136,9 +134,9 @@ def fill_results_dict(results_dict: dict, log_dir: Path) -> dict:
for stat in results_dict:
data_points = results_dict[stat][distractor][model]["raw"]
results_dict[stat][distractor][model]["mean"] = np.mean(data_points)
results_dict[stat][distractor][model]["std_err"] = np.std(
data_points
) / np.sqrt(NUM_REPEATS)
results_dict[stat][distractor][model]["std_err"] = np.std(data_points) / np.sqrt(
NUM_REPEATS
)
return results_dict


Expand Down Expand Up @@ -205,9 +203,7 @@ def make_bar_plot(results_dict: dict, stat: str, save_path: Path):
legend_indices = list(range(len(distractors)))[::-1]

means = [[data[dis][model]["mean"] for dis in distractors] for model in models]
std_errs = [
[data[dis][model]["std_err"] for dis in distractors] for model in models
]
std_errs = [[data[dis][model]["std_err"] for dis in distractors] for model in models]
cmap = plt.get_cmap("Set3")
colors = np.array([cmap(i) for i in range(len(distractors))])

Expand Down Expand Up @@ -282,15 +278,9 @@ def count_tokens(log_dir) -> dict[str, dict[str, dict[str, int]]]:
samplings = log_utils.extract_individual_results(log, "sampling")
for sampling in samplings:
usage = sampling["usage"]
token_counts[model][distractor]["input"] += zero_if_none(
usage["prompt_tokens"]
)
token_counts[model][distractor]["output"] += zero_if_none(
usage["completion_tokens"]
)
token_counts[model][distractor]["total"] += zero_if_none(
usage["total_tokens"]
)
token_counts[model][distractor]["input"] += zero_if_none(usage["prompt_tokens"])
token_counts[model][distractor]["output"] += zero_if_none(usage["completion_tokens"])
token_counts[model][distractor]["total"] += zero_if_none(usage["total_tokens"])
return token_counts


Expand Down Expand Up @@ -318,11 +308,7 @@ def main(args: argparse.Namespace):

if __name__ == "__main__":
    # Script entry point: parse the two required directories and hand them to main().
    # Each option is registered exactly once; adding the same option string twice
    # makes argparse raise a conflicting-option error.
    parser = argparse.ArgumentParser()
    parser.add_argument("--log_dir", type=str, required=True, help="Where the logs are stored")
    parser.add_argument("--save_dir", type=str, required=True, help="Where to save the plots")
    args = parser.parse_args()
    main(args)
4 changes: 3 additions & 1 deletion evals/elsuite/cant_do_that_anymore/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ def run(self, recorder: RecorderBase) -> dict[str, Union[float, int]]:
std_num_previous_moves = np.std([i["num_previous_moves"] for i in metrics])

delta = predicted_move_in_variant_proportion - predicted_move_proportion
variant_impact_factor = (delta / predicted_move_proportion) if predicted_move_proportion != 0 else 0
variant_impact_factor = (
(delta / predicted_move_proportion) if predicted_move_proportion != 0 else 0
)

results = {
"variant_impact_factor": variant_impact_factor,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ def get_func_from_code(code):


def get_complexity(code: str) -> int:
    """Return the number of bytecode instructions in ``def _tmp(x): return <code>``.

    Args:
        code: A Python expression in the variable ``x``; it may reference ``math``.

    Returns:
        The count of instructions ``dis`` reports for the generated function.

    Raises:
        SyntaxError: If ``code`` is not a valid expression.
    """
    # Local import so the expression namespace does not depend on module-level state.
    import math

    # NOTE: ugly but good enough for dataset-creating code.
    # SECURITY: exec() runs arbitrary code; only pass trusted, locally generated expressions.
    src = f"def _tmp(x): return {code}"
    ns: dict[str, object] = {}
    # Define the function inside a private namespace instead of the module globals,
    # so repeated calls neither leak nor overwrite a module-level name.
    exec(src, {"math": math}, ns)
    fn = ns["_tmp"]
    return len(list(dis.get_instructions(fn)))


def create_dataset(out_file, in_file):
Expand Down
16 changes: 0 additions & 16 deletions evals/elsuite/function_deduction/scripts/make_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,19 @@
"Average Baseline": "blue",
"Full Knowledge Best": "blue",
"Full Knowledge Random": "blue",

"Human": "steelblue",

"gpt-4-32k": "purple",
"gpt-4-32k w CoT": "purple",

"gpt-4-base w Few-shot": "orange",
"gpt-4-base w CoT and Few-shot": "orange",

"gpt-3.5-turbo-16k": "green",
"gpt-3.5-turbo-16k w CoT": "green",

"gemini-pro": "peru",
"gemini-pro w CoT": "peru",

"llama-2-13b-chat": "brown",
"llama-2-13b-chat w CoT": "brown",

"llama-2-70b-chat": "maroon",
"llama-2-70b-chat w CoT": "maroon",

"mixtral-8x7b-instruct": "grey",
"mixtral-8x7b-instruct w CoT": "grey",
}
Expand All @@ -40,27 +32,19 @@
"function_deduction/full_knowledge_best": "Full Knowledge Best",
"function_deduction/full_knowledge_random": "Full Knowledge Random",
"function_deduction/average_baseline": "Average Baseline",

"human_cli": "Human",

"gpt-4-32k": "gpt-4-32k",
"function_deduction/cot/gpt-4-32k": "gpt-4-32k w CoT",

"function_deduction/gpt-4-base": "gpt-4-base w Few-shot",
"function_deduction/cot/gpt-4-base": "gpt-4-base w CoT and Few-shot",

"gpt-3.5-turbo-16k": "gpt-3.5-turbo-16k",
"function_deduction/cot/gpt-3.5-turbo-16k": "gpt-3.5-turbo-16k w CoT",

"generation/direct/gemini-pro": "gemini-pro",
"function_deduction/cot/gemini-pro": "gemini-pro w CoT",

"generation/direct/llama-2-13b-chat": "llama-2-13b-chat",
"function_deduction/cot/llama-2-13b-chat": "llama-2-13b-chat w CoT",

"generation/direct/llama-2-70b-chat": "llama-2-70b-chat",
"function_deduction/cot/llama-2-70b-chat": "llama-2-70b-chat w CoT",

"generation/direct/mixtral-8x7b-instruct": "mixtral-8x7b-instruct",
"function_deduction/cot/mixtral-8x7b-instruct": "mixtral-8x7b-instruct w CoT",
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Agent:
def __init__(self, env):
# Set the action space seed so sampling from it is deterministic
env.action_space.seed(episode)

self.env = env

def act(self, observation):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
import logging
import shutil
from functools import cache
from pathlib import Path
from tempfile import TemporaryDirectory

import pandas as pd
from torchvision import datasets

import evals.elsuite.hr_ml_agent_bench.benchmarks.cifar10.env.train as baseline_script
from evals.elsuite.hr_ml_agent_bench.low_level_actions import execute_script
from evals.elsuite.hr_ml_agent_bench.utils import get_baseline_score


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -54,7 +49,7 @@ def get_naive_baseline_score() -> float:
Executes the baseline script `train.py` and returns the accuracy.
Expects the predictions to be saved to `submission.csv` when run.
"""

scripts_dir = Path(__file__).parent
env_dir = scripts_dir.parent / "env"
naive_baseline = env_dir / "train.py"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Agent:
def __init__(self, env):
# Set the action space seed so sampling from it is deterministic
env.action_space.seed(episode)

self.env = env

def act(self, observation):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Agent:
def __init__(self, env):
# Set the action space seed so sampling from it is deterministic
env.action_space.seed(episode)

self.env = env

def act(self, observation):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ def get_naive_baseline_score() -> float:
"""

env_dir = Path(__file__).parent / ".." / "env"
dataset_dir = get_root_dir() / "registry" / "data" / "hr_ml_agent_bench" / "ogbn_arxiv" / "dataset"
dataset_dir = (
get_root_dir() / "registry" / "data" / "hr_ml_agent_bench" / "ogbn_arxiv" / "dataset"
)

with TemporaryDirectory() as tmp_dir:
dst_dir = Path(tmp_dir) / "env"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Agent:
def __init__(self, env):
# Set the action space seed so sampling from it is deterministic
env.action_space.seed(episode)

self.env = env

def act(self, observation):
Expand Down
14 changes: 6 additions & 8 deletions evals/elsuite/hr_ml_agent_bench/high_level_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def edit_script(
# TODO: handle long file editing
try:
content = read_file(script_name, work_dir=work_dir, **kwargs)
except:
except EnvException:
write_file(script_name, "", work_dir=work_dir, **kwargs)
content = ""

Expand Down Expand Up @@ -135,12 +135,12 @@ def edit_script_lines(
try:
start_line_number = int(start_line_number)
end_line_number = int(end_line_number)
except:
except (TypeError, ValueError):
raise EnvException("start_line_number and end_line_number must be integers")

try:
orig_content = read_file(script_name, work_dir=work_dir, **kwargs)
except:
except EnvException:
write_file(script_name, "", work_dir=work_dir, **kwargs)
orig_content = ""
lines = orig_content.split("\n")
Expand Down Expand Up @@ -197,14 +197,12 @@ def inspect_script_lines(script_name, start_line_number, end_line_number, work_d
try:
start_line_number = int(start_line_number)
end_line_number = int(end_line_number)
except:
except (TypeError, ValueError):
raise EnvException("start_line_number and end_line_number must be integers")
if end_line_number - start_line_number > 100:
raise EnvException("the number of lines to display is limited to 100 lines")

try:
# lines = open(os.path.join(work_dir,script_name)).readlines()
lines = read_file(script_name, work_dir=work_dir, **kwargs).split("\n")
except:
except EnvException:
raise EnvException(f"cannot find script {script_name}")

content = "\n".join(lines[max(int(start_line_number) - 1, 0) : int(end_line_number)])
Expand Down
Loading