diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 905fa4cb..92b5d4e2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10', 'pypy-3.7', 'pypy-3.8'] + python-version: ['3.8', '3.9', '3.10', 'pypy-3.8', 'pypy-3.10'] fail-fast: false steps: - uses: actions/checkout@v2 diff --git a/amazon/ionbenchmark/benchmark_runner.py b/amazon/ionbenchmark/benchmark_runner.py index 9c99a3d2..8418bec1 100644 --- a/amazon/ionbenchmark/benchmark_runner.py +++ b/amazon/ionbenchmark/benchmark_runner.py @@ -1,17 +1,16 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 -""" -This module -""" import gc -import tempfile +import os import platform +import tempfile import time import timeit from amazon.ionbenchmark.benchmark_spec import BenchmarkSpec import amazon.ionbenchmark.Format as _format +from amazon.ionbenchmark.sample_dist import SampleDist _pypy = platform.python_implementation() == 'PyPy' if not _pypy: @@ -21,24 +20,34 @@ class BenchmarkResult: """ Results generated by the `run_benchmark` function. - - * `timings` is a list of numbers representing the number of nanoseconds to complete each iteration - * `batch_size` is the number of times the function was invoked in each iteration - * `peak_memory_usage` is the peak memory allocated during a single run of the benchmark function, in bytes """ - timings = None - batch_size = None - peak_memory_usage = None + nanos_per_op: SampleDist = None + ops_per_second: SampleDist = None + peak_memory_usage = None # measured in bytes - def __init__(self, timings, batch_size, peak_memory_usage): - self.timings = timings - self.batch_size = batch_size + def __init__(self, nanos_per_op, ops_per_second, peak_memory_usage): + self.nanos_per_op = SampleDist(nanos_per_op) + self.ops_per_second = SampleDist(ops_per_second) self.peak_memory_usage = peak_memory_usage def run_benchmark(benchmark_spec: BenchmarkSpec): """ Run benchmarks for `benchmark_spec`. + + The overall approach of this runner is to time multiple samples, where each sample consists of multiple invocations + of the test function. As a rule of thumb, a sample size of 30 is the minimum needed for a useful level of + confidence in the results. The margin of error is (roughly speaking) inversely proportional to the square root of + the sample size, so adding more samples increases confidence, but with diminishing returns; in practice, a sample + size greater than 1000 is rarely worth the extra run time. + + This approach is sound because of the Central Limit Theorem. For an approachable introduction, see + https://www.kristakingmath.com/blog/sampling-distribution-of-the-sample-mean. + + The reason for multiple invocations per sample is to keep timings of very short functions from being dominated by + memory placement or other small run-to-run variations from one sample to the next. This runner uses the `Timer` + utility's `autorange()` function to determine how many times the function must be invoked for it to run for + at least 1 second. That number is then used as the number of invocations for _every_ sample in the set.
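To make the square-root relationship described above concrete, the following standalone sketch (standard library only; the 50 ns standard deviation is a made-up figure, and 0.999 matches the confidence level used by the error fields in report.py) shows how the margin of error shrinks as the sample size grows:

import statistics

# z-score for a two-sided 99.9% confidence interval (the same computation sample_dist.py uses)
z = statistics.NormalDist().inv_cdf((1 + 0.999) / 2)

sample_stdev = 50.0  # hypothetical standard deviation of the per-op timings, in nanoseconds

for n in (30, 100, 1000):
    margin_of_error = z * sample_stdev / (n ** 0.5)
    print(f"n={n:>4}: margin of error ~ +/-{margin_of_error:.1f} ns")

Going from 30 to 1000 samples costs roughly 33x the run time but only tightens the margin of error by a factor of about sqrt(1000/30), or ~5.8, which is the diminishing return mentioned above.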
""" test_fun = _create_test_fun(benchmark_spec) @@ -59,11 +68,26 @@ def run_benchmark(benchmark_spec: BenchmarkSpec): # warm up timer.timeit(benchmark_spec.get_warmups()) - # iteration - (batch_size, _) = timer.autorange() - timings = timer.repeat(benchmark_spec.get_iterations(), batch_size) + # TODO: Consider making the target batch time or the batch size configurable instead of using this hack. + if "PYTEST_CURRENT_TEST" in os.environ: + # make the unit tests run in a reasonable time + batch_size = 1 + else: + # range-finding + # This needs the default timer (measuring in seconds) to work correctly, so it's a different Timer instance. + (batch_size, _) = timeit.Timer(stmt=test_fun, setup=setup).autorange() + # Ad hoc testing indicates that samples of 1-2 seconds give tighter results than the default 0.2 seconds, but for + # very quick testing, this can be annoyingly slow. + batch_size *= 5 # ~1-2 seconds + + # sample collection (iterations) + raw_timings = timer.repeat(benchmark_spec.get_iterations(), batch_size) + + # Normalize the samples (i.e. remove the effect of the batch size) before returning the results + nanos_per_op = [t/batch_size for t in raw_timings] + ops_per_sec = [1000000000.0 / t for t in nanos_per_op] - return BenchmarkResult(timings, batch_size, peak_memory_usage) + return BenchmarkResult(nanos_per_op, ops_per_sec, peak_memory_usage) def _create_test_fun(benchmark_spec: BenchmarkSpec): diff --git a/amazon/ionbenchmark/ion_benchmark_cli.py b/amazon/ionbenchmark/ion_benchmark_cli.py index 2d1a0346..02e7c0a3 100644 --- a/amazon/ionbenchmark/ion_benchmark_cli.py +++ b/amazon/ionbenchmark/ion_benchmark_cli.py @@ -37,7 +37,7 @@ from amazon.ionbenchmark.Format import format_is_ion, rewrite_file_to_format from amazon.ionbenchmark.benchmark_runner import run_benchmark -from amazon.ionbenchmark.report import report_stats +from amazon.ionbenchmark.report import report_stats, get_report_field_by_name from amazon.ionbenchmark.benchmark_spec import BenchmarkSpec # Relate pypy incompatible issue - https://github.com/amazon-ion/ion-python/issues/227 @@ -51,7 +51,7 @@ def compare_command(): Compare the results of two benchmarks to determine if has regressed compared to . Usage: - ion_python_benchmark_cli.py compare [-fq][--abc ][--threshold ][--output ][-c ]... + ion_python_benchmark_cli.py compare [-fq][--threshold ][--output ][-c ] Arguments: A report from running a benchmark at some point in the past. @@ -59,7 +59,7 @@ def compare_command(): A new report to compare against Options: - -c , --compare A field to compare in the reports. [default: file_size(B) time_min(ns)] + -c , --compare A comma separated list of fields to compare in the reports. [default: file_size,time_mean] -o --output PATH File to write the regression report. -q --quiet Suppress writing regressions to std out. [default: False] -t , --threshold Margin of error for comparison. [default: 0.20] @@ -70,7 +70,13 @@ def compare_command(): current_path = args[''] output_file_for_comparison = args['--output'] regression_threshold = float(args['--threshold']) - comparison_keywords = args['--compare'] + comparison_keywords_arg = args['--compare'] + + # TODO: Update this command to use the information in REPORT_FIELDS, such as the direction of improvement (doi). + # https://github.com/amazon-ion/ion-python/issues/281 + # Without that (i.e. right now), the compare command will actually fail when the ops/sec metric improves. 
:S + + comparison_fields = [get_report_field_by_name(name) for name in comparison_keywords_arg.split(",")] with open(previous_path, 'br') as p, open(current_path, 'br') as c: previous_results = ion.load(p) @@ -81,18 +87,22 @@ def compare_command(): # For results of each configuration pattern with the same file for idx, prev_result in enumerate(previous_results): cur_result = current_results[idx] - name = cur_result['name'] - result = {'name': name} - for keyword in comparison_keywords: - cur = float(cur_result[keyword]) - prev = float(prev_result[keyword]) + case_name = cur_result['name'] + result = {'name': case_name} + for field in comparison_fields: + if field.units is not None: + key = f"{field.name}({field.units})" + else: + key = field.name + cur = float(cur_result[key]) + prev = float(prev_result[key]) relative_diff = (cur - prev) / prev pct_diff = f"{relative_diff:.2%}" - result[keyword] = pct_diff + result[key] = pct_diff if relative_diff > regression_threshold: if not args['--quiet']: - print(f"{name} '{keyword}' changed by {pct_diff}: {prev} => {cur}") + print(f"{case_name} '{key}' changed by {pct_diff}: {prev} => {cur}") has_regression = True report.append(result) @@ -207,7 +217,7 @@ def run_spec_command(): -o --output FILE Destination to store the report. If unset, prints to std out. - -r --report FIELDS Comma-separated list of fields to include in the report. [default: file_size, time_min, time_mean, memory_usage_peak] + -r --report FIELDS Comma-separated list of fields to include in the report. [default: file_size,ops/s_mean,ops/s_error,memory_usage_peak] Example: ./ion_python_benchmark_cli.py run my_spec_file.ion -d '{iterations:1000}' -o '{warmups:0}' -r "time_min, file_size, peak_memory_usage" @@ -284,7 +294,15 @@ def _run_benchmarks(specs: list, report_fields, output_file): result_stats = report_stats(benchmark_spec, result, report_fields) report.append(result_stats) - print(tabulate(report, tablefmt='fancy_grid', headers='keys')) + # TODO: Add some option to dump or otherwise expose the raw sample data. For now, you can + # uncomment the following lines to get the raw results as CSV that can be copy/pasted into a spreadsheet. + # + # printable_key = benchmark_spec.get_name().replace(" ", "").replace(",", "-") + # for _x in [printable_key, *result.timings]: + # print(_x, end=",") + # print("") + + print(tabulate(report, tablefmt='pipe', headers='keys', floatfmt='.2f')) if output_file: des_dir = os.path.dirname(output_file) diff --git a/amazon/ionbenchmark/ion_load_dump.py b/amazon/ionbenchmark/ion_load_dump.py index b33c837d..c56486e0 100644 --- a/amazon/ionbenchmark/ion_load_dump.py +++ b/amazon/ionbenchmark/ion_load_dump.py @@ -15,7 +15,9 @@ class IonLoadDump: def __init__(self, binary, c_ext=True): self._binary = binary self._single_value = False - self._c_ext = c_ext + # Need an explicit check here because if `None` is passed in as an argument, that is different from no argument, + # and results in an unexpected behavior. + self._c_ext = c_ext if c_ext is not None else True def loads(self, s): ion.c_ext = self._c_ext diff --git a/amazon/ionbenchmark/report.py b/amazon/ionbenchmark/report.py index f176b4f3..87b2fcf4 100644 --- a/amazon/ionbenchmark/report.py +++ b/amazon/ionbenchmark/report.py @@ -1,14 +1,74 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
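The TODO above notes that the compare command, as written, flags an improvement in ops/s as a regression because it only checks whether the relative difference exceeds the threshold. A minimal sketch of the direction-of-improvement handling described in issue #281 might look like the following; this is illustrative, not part of the change, and `doi` follows the ReportField.doi convention defined in report.py (+1 means larger is better, -1 means smaller is better):

def is_regression(prev: float, cur: float, threshold: float, doi: int = -1) -> bool:
    """Return True if `cur` is worse than `prev` by more than `threshold` (a ratio, e.g. 0.20)."""
    relative_diff = (cur - prev) / prev
    # For a smaller-is-better metric (doi == -1) an increase is a regression;
    # for a larger-is-better metric (doi == +1) a decrease is a regression.
    return relative_diff * -doi > threshold

# An ops/s improvement is not flagged...
assert not is_regression(prev=1000.0, cur=1500.0, threshold=0.20, doi=+1)
# ...but a 30% increase in time_mean is.
assert is_regression(prev=100.0, cur=130.0, threshold=0.20, doi=-1)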
# SPDX-License-Identifier: Apache-2.0 -import os -import statistics -from math import ceil + +from collections.abc import Callable + +from dataclasses import dataclass from amazon.ionbenchmark.benchmark_runner import BenchmarkResult from amazon.ionbenchmark.benchmark_spec import BenchmarkSpec -def report_stats(benchmark_spec: BenchmarkSpec, benchmark_result: BenchmarkResult, report_fields: list = None): +@dataclass(frozen=True) +class ReportField: + """ + Represents a field that can be included in the benchmark test report + """ + name: str + compute_fn: Callable # Callable[[BenchmarkSpec, BenchmarkResult], Any] + units: str = None + # Direction of improvement, used by the compare command + doi: int = None + + +REPORT_FIELDS = [ + # TODO: Make sure we have the fields we need to perform a statistically meaningful comparison + # I.e. if we end up needing to use ANOVA or Independent Samples T Test, do we have the fields we need? + ReportField(name="format", + compute_fn=lambda spec, _: spec.get_format()), + ReportField(name="input_file", + compute_fn=lambda spec, _: spec.get_input_file()), + ReportField(name="operation", + compute_fn=lambda spec, _: spec.derive_operation_name()), + ReportField(name="file_size", units="B", doi=-1, + compute_fn=lambda spec, _: spec.get_input_file_size()), + ReportField(name="memory_usage_peak", units="B", doi=-1, + compute_fn=lambda _, result: result.peak_memory_usage), + ReportField(name="time_mean", units="ns", doi=-1, + compute_fn=lambda _, result: result.nanos_per_op.mean), + ReportField(name="time_min", units="ns", doi=-1, + compute_fn=lambda _, result: result.nanos_per_op.min), + ReportField(name="time_max", units="ns", doi=-1, + compute_fn=lambda _, result: result.nanos_per_op.max), + ReportField(name="time_sd", units="ns", + compute_fn=lambda _, result: result.nanos_per_op.stdev), + ReportField(name="time_rsd", units="%", + compute_fn=lambda _, result: result.nanos_per_op.rstdev * 100), + ReportField(name="time_error", units="ns", + compute_fn=lambda _, result: result.nanos_per_op.margin_of_error(confidence=0.999)), + ReportField(name="ops/s_mean", doi=+1, + compute_fn=lambda _, result: result.ops_per_second.mean), + ReportField(name="ops/s_min", doi=+1, + compute_fn=lambda _, result: result.ops_per_second.min), + ReportField(name="ops/s_max", doi=+1, + compute_fn=lambda _, result: result.ops_per_second.max), + ReportField(name="ops/s_sd", + compute_fn=lambda _, result: result.ops_per_second.stdev), + ReportField(name="ops/s_rsd", units="%", + compute_fn=lambda _, result: result.ops_per_second.rstdev * 100), + ReportField(name="ops/s_error", + compute_fn=lambda _, result: result.ops_per_second.margin_of_error(confidence=0.999)), +] + + +def get_report_field_by_name(name: str): + for field in REPORT_FIELDS: + if name == field.name: + return field + raise ValueError(f"Not a valid report field: {name}") + + +def report_stats(benchmark_spec: BenchmarkSpec, benchmark_result: BenchmarkResult, report_fields: list): """ Generate a report for the outcome of a running a benchmark. @@ -19,69 +79,25 @@ def report_stats(benchmark_spec: BenchmarkSpec, benchmark_result: BenchmarkResul * `input_file` – the file used for this benchmark * `format` – the format used for this benchmark * `memory_usage_peak` – the peak amount of memory allocated while running the benchmark function - * `time_` – time statistic for the benchmark; `` can be `mean`, `min`, `max`, `median`, or `p` where - `` is any number from 0 to 100 inclusive. 
- * `rate_` – throughput statistic for the benchmark; `` can be `mean`, `min`, `max`, `median`, or `p` - where `` is any number from 0 to 100 inclusive. + * `time_` – time statistic for the benchmark + * `ops/s_` – number of operations (invocations of the benchmark function) per second + + `` can be `mean`, `min`, `max`, `median`, `error`, `stdev`, or `rstdev` :param benchmark_spec: The spec for the benchmark that was run :param benchmark_result: The output from the benchmark :param report_fields: list[str] of fields to include in the report. :return: """ - if report_fields is None: - report_fields = ['file_size', 'time_min', 'time_mean', 'memory_usage_peak'] result = {'name': benchmark_spec.get_name()} - for field in report_fields: - if isinstance(field, str) and field.startswith("time_"): - # Note–we use `field[len("time_"):]` instead of `removeprefix("time_")` to support python 3.7 and 3.8 - stat_value = _calculate_timing_stat(field[len("time_"):], benchmark_result.timings, benchmark_result.batch_size) - result[f'{field}(ns)'] = stat_value - elif isinstance(field, str) and field.startswith("rate_"): - timing_value = _calculate_timing_stat(field[len("rate_"):], benchmark_result.timings, benchmark_result.batch_size) - stat_value = ceil(benchmark_spec.get_input_file_size() * 1024 / (timing_value / benchmark_result.batch_size / 1000000000)) - result[f'{field}(kB/s)'] = stat_value - elif field == 'format': - result['format'] = benchmark_spec.get_format() - elif field == 'input_file': - result['input_file'] = os.path.basename(benchmark_spec.get_input_file()) - elif field == 'operation': - result['operation'] = benchmark_spec.derive_operation_name() - elif field == 'file_size': - result['file_size(B)'] = benchmark_spec.get_input_file_size() - elif field == 'memory_usage_peak': - result['memory_usage_peak(B)'] = benchmark_result.peak_memory_usage - elif field == 'name': - pass + for field_name in report_fields: + field = get_report_field_by_name(field_name) + if field.units is not None: + key = f"{field.name}({field.units})" else: - raise ValueError(f"Unrecognized report field '{field}'") + key = field.name + result[key] = field.compute_fn(benchmark_spec, benchmark_result) return result - - -def _calculate_timing_stat(stat: str, timings, batch_size): - """ - Calculate a statistic for the given timings. - - :param stat: Name of a statistic. Can be `min`, `max`, `median`, `mean`, or `p` where `N` is 0 to 100 exclusive. - :param timings: List of result times from running the benchmark function. - :param batch_size: Number of times the benchmark function was invoked to produce a single timing result. - :return: - """ - if stat.startswith("p"): - n = int(stat[1:]) - x = ceil(statistics.quantiles(timings, n=100, method='inclusive')[n-1]/batch_size) - elif stat == 'mean': - x = ceil(sum(timings) / (batch_size * len(timings))) - elif stat == 'min': - x = ceil(min(timings) / batch_size) - elif stat == 'max': - x = ceil(max(timings) / batch_size) - elif stat == 'median': - x = ceil(statistics.median(timings) / batch_size) - else: - raise ValueError(f"Unrecognized statistic {stat}") - return x - diff --git a/amazon/ionbenchmark/sample_dist.py b/amazon/ionbenchmark/sample_dist.py new file mode 100644 index 00000000..30091cbf --- /dev/null +++ b/amazon/ionbenchmark/sample_dist.py @@ -0,0 +1,60 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import statistics + +# singleton NormalDist that is the _unit normal distribution_, used for getting z-scores for confidence levels +_unit_normal = statistics.NormalDist() + + +class SampleDist(list): + """ + Represents a normal-ish sample distribution. Similar to [NormalDist], but retains the source data. + """ + + def __init__(self, data: list): + super().__init__(data) + self.__max = max(data) + self.__min = min(data) + self.__mean = statistics.fmean(data) # This runs faster than the mean() function, and it always returns a float + self.__stdev = statistics.stdev(data) + self.__variance = statistics.variance(data, self.__mean) + + @property + def min(self): + """Return the min value in the sample set.""" + return self.__min + + @property + def max(self): + """Return the max value in the sample set.""" + return self.__max + + @property + def mean(self): + """Return the mean of the sample distribution.""" + return self.__mean + + @property + def variance(self): + """Return the variance of the sample distribution.""" + return self.__variance + + @property + # Sometimes also spelled "stddev", but Python Std Lib statistic module uses "stdev". + def stdev(self): + """Return the standard deviation of the sample distribution.""" + return self.__stdev + + @property + def rstdev(self): + """Return the standard deviation of the sample distribution as a ratio relative to the sample mean""" + return self.__stdev / self.__mean + + def margin_of_error(self, confidence: float): + """ + Return the margin of error of the mean of this sample distribution at the given confidence level. Can be + combined with the mean to express the confidence interval as `mean ± error`. + """ + z_score = _unit_normal.inv_cdf((1 + confidence) / 2.) + return z_score * self.stdev / (len(self) ** .5) diff --git a/tests/test_benchmark_cli.py b/tests/test_benchmark_cli.py index 9f0b06a2..1c5310d9 100644 --- a/tests/test_benchmark_cli.py +++ b/tests/test_benchmark_cli.py @@ -66,18 +66,17 @@ def test_option_read_no_c_extension(file=generate_test_path('integers.ion')): def test_option_read_iterations(file=generate_test_path('integers.ion')): # This is a potentially flaky test due to the overhead of running the CLI as a new process. start = time.perf_counter() - (error_code, _, _) = run_cli(['read', file, '--iterations', '1']) + (error_code, _, _) = run_cli(['read', file, '--iterations', '3']) stop = time.perf_counter() assert not error_code time_1 = stop - start start = time.perf_counter() - (error_code, _, _) = run_cli(['read', file, '--iterations', '10000']) + (error_code, _, _) = run_cli(['read', file, '--iterations', '300']) stop = time.perf_counter() assert not error_code time_2 = stop - start - # Executing 10000 times should be longer than benchmark only once, but don't have to be exact 100x faster. assert time_2 > time_1
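For reference, here is how the new SampleDist and BenchmarkResult values are meant to be read together. This is a usage sketch, not code from the change: it assumes the amazon.ionbenchmark package from this branch is importable, and the timing values are made up.

from amazon.ionbenchmark.sample_dist import SampleDist

# Hypothetical per-op timings in nanoseconds, i.e. what BenchmarkResult.nanos_per_op holds.
nanos_per_op = SampleDist([980.0, 1010.0, 995.0, 1003.0, 988.0, 1021.0, 999.0, 1012.0, 985.0, 1007.0])

# The time_mean(ns) and time_error(ns) report fields combine into a confidence interval.
error = nanos_per_op.margin_of_error(confidence=0.999)
print(f"time_mean ~ {nanos_per_op.mean:.1f} ns +/- {error:.1f} ns (99.9% confidence)")
print(f"time_rsd  ~ {nanos_per_op.rstdev:.2%}")

The ops/s_* report fields are the same statistics computed over the reciprocal samples, 1e9 / nanos_per_op.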