Skip to content

Commit

Permalink
Set up calibration executables for load and provisioning data gathering (#195)
Browse files Browse the repository at this point in the history

* Move Redshift-focused query runner calibration script into the main source tree

* Generalize the load tool, reorganize dirs too

* Add workload definitions

* Add in gathering configs

* Fixes

* Avoid pickling issues

* Fix metrics formatting

* Include index

* More fixes

* Adjust experiment size

* Fix lint
  • Loading branch information
geoffxy authored Jul 15, 2023
1 parent 35b0a78 commit d322474
Show file tree
Hide file tree
Showing 11 changed files with 798 additions and 0 deletions.
Empty file.
Empty file.
74 changes: 74 additions & 0 deletions src/brad/calibration/load/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# CloudWatch metrics used to characterize engine load. Every metric name is
# paired with each of the three statistics we retrieve for it, so the list is
# the full cross product (metric-major, statistics in Min/Max/Avg order).
CLOUDWATCH_LOAD_METRICS = [
    (metric_name, statistic)
    for metric_name in (
        "CPUUtilization",
        "ReadIOPS",
        "WriteIOPS",
        "ReadThroughput",
        "WriteThroughput",
        "ReadLatency",
        "WriteLatency",
    )
    for statistic in ("Minimum", "Maximum", "Average")
]

# Performance Insights counters used to characterize engine load, grouped by
# origin. Order is significant to downstream consumers, so the groups are
# concatenated in the original sequence: OS counters, database counters, then
# Aurora-specific storage counters.
PERF_INSIGHTS_LOAD_METRICS = (
    # Operating-system level counters.
    [
        "os.loadAverageMinute.one",
        "os.loadAverageMinute.five",
        "os.loadAverageMinute.fifteen",
        "os.cpuUtilization.system",
        "os.cpuUtilization.total",
        "os.cpuUtilization.user",
        "os.diskIO.avgQueueLen",
        "os.diskIO.tps",
        "os.diskIO.util",
        "os.diskIO.readIOsPS",
        "os.diskIO.readKbPS",
        "os.diskIO.writeIOsPS",
        "os.diskIO.writeKbPS",
        "os.network.rx",
        "os.network.tx",
        "os.memory.active",
        "os.memory.dirty",
        "os.memory.free",
        "os.memory.writeback",
        "os.memory.total",
        "os.tasks.blocked",
        "os.tasks.running",
        "os.tasks.sleeping",
        "os.tasks.stopped",
        "os.tasks.total",
    ]
    # Database-level counters (PostgreSQL-style SQL/transaction stats).
    + [
        "db.SQL.queries",
        "db.SQL.total_query_time",
        "db.SQL.tup_deleted",
        "db.SQL.tup_fetched",
        "db.SQL.tup_inserted",
        "db.SQL.tup_returned",
        "db.SQL.tup_updated",
        "db.Transactions.active_transactions",
        "db.Transactions.blocked_transactions",
        "db.Transactions.duration_commits",
        "db.Transactions.xact_commit",
        "db.Transactions.xact_rollback",
    ]
    # NOTE: Aurora has specific storage metrics (probably because they use a
    # custom storage engine):
    # https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_PerfInsights_Counters.html#USER_PerfInsights_Counters.Aurora_PostgreSQL
    + [
        "os.diskIO.auroraStorage.auroraStorageBytesRx",
        "os.diskIO.auroraStorage.auroraStorageBytesTx",
        "os.diskIO.auroraStorage.diskQueueDepth",
        "os.diskIO.auroraStorage.readThroughput",
        "os.diskIO.auroraStorage.writeThroughput",
        "os.diskIO.auroraStorage.readLatency",
        "os.diskIO.auroraStorage.writeLatency",
        "os.diskIO.auroraStorage.readIOsPS",
        "os.diskIO.auroraStorage.writeIOsPS",
    ]
)
154 changes: 154 additions & 0 deletions src/brad/calibration/load/query_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import pathlib
import random
import sys
import time
import queue
import multiprocessing as mp
import io
import signal
from typing import Callable

from brad.config.engine import Engine
from brad.config.file import ConfigFile
from brad.connection.connection import Cursor
from brad.server.engine_connections import EngineConnections


class Options:
    """
    Configuration for a single load-generation worker process.

    The timing/caching knobs (`disable_redshift_cache`, `avg_gap_s`,
    `std_gap_s`, `seed`) were previously fixed attribute defaults that callers
    mutated after construction; they are now optional keyword-only arguments
    with the same defaults, so existing call sites are unaffected.
    """

    def __init__(
        self,
        worker_idx: int,
        output_file: pathlib.Path,
        config: ConfigFile,
        engine: Engine,
        schema_name: str,
        *,
        disable_redshift_cache: bool = False,
        avg_gap_s: float = 1.0,
        std_gap_s: float = 0.5,
        seed: int = 42,
    ) -> None:
        # Identifies this worker; also folded into the RNG seed by the runner.
        self.worker_idx = worker_idx
        self.config = config
        self.engine = engine
        self.schema_name = schema_name

        # Set to True if running on Redshift (disables its result cache).
        self.disable_redshift_cache = disable_redshift_cache
        # CSV destination for this worker's per-query timings.
        self.output_file = output_file

        # Gaussian "think time" between successive queries, in seconds.
        self.avg_gap_s = avg_gap_s
        self.std_gap_s = std_gap_s
        self.seed = seed


class Context:
    """Bundles the per-worker state handed to each query-runner callable."""

    def __init__(
        self,
        cursor: Cursor,
        output_file: io.TextIOBase,
        prng: random.Random,
        options: Options,
    ) -> None:
        # Plain state container; assignment order is irrelevant.
        self.options = options
        self.prng = prng
        self.output_file = output_file
        self.cursor = cursor


# Signature shared by all query-runner callables: each receives the
# per-worker Context and runs one query (including any think-time delay).
RunQueryCallable = Callable[[Context], None]


def run_specific_query_until_signalled(
    query_idx: int,
    query: str,
    options: Options,
    start_queue: mp.Queue,
    stop_queue: mp.Queue,
) -> None:
    """Subprocess entry point: run one fixed query repeatedly until signalled."""
    run_until_signalled(
        get_run_specific_query(query_idx, query), options, start_queue, stop_queue
    )


def run_until_signalled(
    run_query: RunQueryCallable,
    options: Options,
    start_queue: mp.Queue,
    stop_queue: mp.Queue,
) -> None:
    """
    Meant to be launched as a subprocess with multiprocessing.

    Repeatedly invokes `run_query` against the engine named in `options`,
    writing per-query timings to `options.output_file` as CSV
    (header: `query_idx,run_time_s`).

    Handshake protocol with the controller: the worker puts a message on
    `start_queue` once its connection is ready, then blocks until the first
    message arrives on `stop_queue`; after that, any further message on
    `stop_queue` tells the worker to exit.
    """

    def noop_handler(_signal, _frame):
        pass

    # Ignore Ctrl-C in the worker; the controller decides when we stop (via
    # `stop_queue`), and we still want the `finally` cleanup below to run.
    signal.signal(signal.SIGINT, noop_handler)

    ec = EngineConnections.connect_sync(
        options.config,
        options.schema_name,
        autocommit=True,
        specific_engines={options.engine},
    )

    try:
        conn = ec.get_connection(options.engine)
        cursor = conn.cursor_sync()

        # Hacky way to disable the query cache when applicable.
        if options.disable_redshift_cache:
            print(
                "Disabling Redshift result cache (client {})".format(options.worker_idx)
            )
            cursor.execute_sync("SET enable_result_cache_for_session = OFF;")

        # XOR with the worker index so each worker draws a distinct but
        # reproducible stream of think-time delays.
        prng = random.Random(options.seed ^ options.worker_idx)

        with open(options.output_file, "w", encoding="UTF-8") as file:
            ctx = Context(cursor, file, prng, options)
            print("query_idx,run_time_s", file=file, flush=True)

            # Signal that we're ready to start and wait for the controller.
            start_queue.put_nowait("")
            _ = stop_queue.get()

            while True:
                run_query(ctx)
                # Non-blocking poll: keep running queries until the
                # controller posts a stop message.
                try:
                    _ = stop_queue.get_nowait()
                    break
                except queue.Empty:
                    pass
    finally:
        # Always release the engine connections, even if a query raised.
        ec.close_sync()


def get_run_specific_query(query_idx: int, query_str: str) -> RunQueryCallable:
    """
    Returns a callable that runs `query_str` once per invocation, preceded by a
    Gaussian-distributed think-time delay, and appends a `query_idx,run_time_s`
    CSV row to the context's output file. Query errors (e.g., timeouts) are
    logged to stderr and skipped so the load generator keeps running.
    """

    def run_specific_query(ctx: Context) -> None:
        # A Gaussian sample can be negative; clamp the think time at zero.
        wait_for_s = max(
            0.0, ctx.prng.gauss(ctx.options.avg_gap_s, ctx.options.std_gap_s)
        )
        time.sleep(wait_for_s)

        try:
            # Use a monotonic clock for elapsed-time measurement: time.time()
            # can jump (e.g., NTP adjustments), which would corrupt the
            # recorded run times.
            start = time.monotonic()
            ctx.cursor.execute_sync(query_str)
            ctx.cursor.fetchall_sync()
            end = time.monotonic()
            print(
                "{},{}".format(query_idx, end - start),
                file=ctx.output_file,
                flush=True,
            )

        except Exception as ex:
            # Best effort: record the failure and move on.
            print(
                "Skipping query {} because of an error (potentially timeout)".format(
                    query_idx
                ),
                file=sys.stderr,
                flush=True,
            )
            print(ex, file=sys.stderr, flush=True)

    return run_specific_query
Loading

0 comments on commit d322474

Please sign in to comment.