
Add changing SLO experiment scripts and definitions #494

Merged · 8 commits · Apr 17, 2024
31 changes: 31 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/COND
@@ -0,0 +1,31 @@
include("../common.cond")

QUERIES = [99, 56, 32, 92, 91, 49, 30, 83, 94, 38, 87, 86, 76, 37, 31, 46, 58, 61, 62, 64, 69, 73, 74, 51, 57, 60]

COMMON_CONFIGS = {
    "physical-config-file": "config/physical_config_100gb.yml",
    "schema-name": "imdb_extended_100g",
    "ra-query-bank-file": IMDB_100GB_REGULAR_QUERY_BANK,
    "txn-scale-factor": IMDB_100GB_SF,
    "num-front-ends": 24,
    "dataset-type": "100gb",
    "ra-query-indexes": ",".join(map(str, QUERIES))
}

run_experiment(
    name="brad_100g",
    run="./run_workload.sh",
    options={
        "system-config-file": "slo_change_config.yml",
        **COMMON_CONFIGS,
    },
)

run_command(
    name="brad_100g_debug",
    run="./run_workload_debug.sh",
    options={
        "system-config-file": "slo_change_config.yml",
        **COMMON_CONFIGS,
    },
)
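For reference, the ra-query-indexes option handed to the workload scripts is simply the comma-joined QUERIES list. The following is a minimal standalone Python sketch (not part of the pull request) of what that expression evaluates to:

# Standalone sketch: reproduces the ra-query-indexes value built in the COND file above.
QUERIES = [99, 56, 32, 92, 91, 49, 30, 83, 94, 38, 87, 86, 76, 37, 31,
           46, 58, 61, 62, 64, 69, 73, 74, 51, 57, 60]

ra_query_indexes = ",".join(map(str, QUERIES))
print(ra_query_indexes)
# 99,56,32,92,91,49,30,83,94,38,87,86,76,37,31,46,58,61,62,64,69,73,74,51,57,60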
48 changes: 48 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/run_workload.sh
@@ -0,0 +1,48 @@
#! /bin/bash

script_loc=$(cd $(dirname $0) && pwd -P)
cd $script_loc
source ../common.sh

# Arguments:
# --system-config-file
# --physical-config-file
# --query-indexes
extract_named_arguments $@

start_brad $system_config_file $physical_config_file
log_workload_point "brad_start_initiated"
sleep 30

log_workload_point "clients_starting"
# 8 clients, offset 16 (for the transactional clients)
start_repeating_olap_runner 8 15 5 $ra_query_indexes "ra_8" 16
rana_pid=$runner_pid

start_txn_runner_serial 16 # Implicit: --dataset-type
txn_pid=$runner_pid

log_workload_point "clients_started"

function inner_cancel_experiment() {
  cancel_experiment $rana_pid $txn_pid
}

trap "inner_cancel_experiment" INT
trap "inner_cancel_experiment" TERM

# Sleep for 10 minutes and then change the SLOs.
sleep $(( 10 * 60 ))

log_workload_point "changing_slo"
brad cli --command "BRAD_CHANGE_SLO 30.0 0.030"
log_workload_point "changed_slo"

# Wait another hour before stopping.
sleep $(( 60 * 60 ))

# Shut down everything now.
log_workload_point "experiment_workload_done"
>&2 echo "Experiment done. Shutting down runners..."
graceful_shutdown $rana_pid $txn_pid
log_workload_point "shutdown_complete"
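The sleeps above define the rough experiment timeline. Below is a small standalone sketch (not part of the pull request) that tabulates the approximate offsets of the log_workload_point markers; client start-up time is not accounted for, so treat these as estimates:

# Approximate timeline of run_workload.sh, derived from the sleeps above.
# Actual timestamps come from the log_workload_point markers, not this sketch.
timeline = [
    ("brad_start_initiated", 0),
    ("clients_starting", 30),                               # after the 30 s warm-up sleep
    ("changing_slo", 30 + 10 * 60),                         # SLO change after ~10 minutes
    ("experiment_workload_done", 30 + 10 * 60 + 60 * 60),   # plus one more hour
]
for marker, offset_s in timeline:
    minutes, seconds = divmod(offset_s, 60)
    print(f"{marker}: ~{minutes} min {seconds} s after start")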
49 changes: 49 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/run_workload_debug.sh
@@ -0,0 +1,49 @@
#! /bin/bash

script_loc=$(cd $(dirname $0) && pwd -P)
cd $script_loc
source ../common.sh

# Arguments:
# --system-config-file
# --physical-config-file
# --query-indexes
extract_named_arguments $@

export BRAD_IGNORE_BLUEPRINT=1
start_brad_debug $system_config_file $physical_config_file
log_workload_point "brad_start_initiated"
sleep 30

log_workload_point "clients_starting"
# 8 clients, offset 16 (for the transactional clients)
start_repeating_olap_runner 8 15 5 $ra_query_indexes "ra_8" 16
rana_pid=$runner_pid

start_txn_runner_serial 16 # Implicit: --dataset-type
txn_pid=$runner_pid

log_workload_point "clients_started"

function inner_cancel_experiment() {
  cancel_experiment $rana_pid $txn_pid
}

trap "inner_cancel_experiment" INT
trap "inner_cancel_experiment" TERM

# Sleep for 2 minutes and then change the SLOs.
sleep $(( 2 * 60 ))

log_workload_point "changing_slo"
brad cli --command "BRAD_CHANGE_SLO 30.0 0.030"
log_workload_point "changed_slo"

# Wait another 10 mins before stopping.
sleep $(( 10 * 60 ))

# Shut down everything now.
log_workload_point "experiment_workload_done"
>&2 echo "Experiment done. Shutting down runners..."
graceful_shutdown $rana_pid $txn_pid
log_workload_point "shutdown_complete"
@@ -0,0 +1,20 @@
#! /bin/bash

if [ -z $1 ]; then
  >&2 echo "Usage: $0 path/to/physical/config.yml"
  exit 1
fi

script_loc=$(cd $(dirname $0) && pwd -P)
cd $script_loc
source ../common.sh

python3 ../../../workloads/IMDB_extended/set_up_starting_blueprint.py \
  --schema-name imdb_extended_100g \
  --query-bank-file ../../../workloads/IMDB_100GB/regular_test/queries.sql \
  --aurora-queries "99,56,32,92,91" \
  --redshift-queries "49,30,83,94,38,87,86,76,37,31,46,58,61,62,64,69,73,74,51,57,60" \
  --redshift-provisioning "dc2.large:2" \
  --aurora-provisioning "db.t4g.medium:2" \
  --system-config-file slo_change_config.yml \
  --physical-config-file $1
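As a sanity check, the Aurora and Redshift query lists passed to set_up_starting_blueprint.py together cover exactly the QUERIES indexes defined in the COND file above: Aurora takes the first five, Redshift the remaining twenty-one. A small standalone sketch (not part of the pull request) to verify:

# Sanity check: the routing split in the set-up script matches the COND QUERIES list.
QUERIES = [99, 56, 32, 92, 91, 49, 30, 83, 94, 38, 87, 86, 76, 37, 31,
           46, 58, 61, 62, 64, 69, 73, 74, 51, 57, 60]
aurora_queries = [99, 56, 32, 92, 91]
redshift_queries = [49, 30, 83, 94, 38, 87, 86, 76, 37, 31, 46, 58, 61,
                    62, 64, 69, 73, 74, 51, 57, 60]

assert aurora_queries + redshift_queries == QUERIES
print(f"{len(aurora_queries)} queries pinned to Aurora, "
      f"{len(redshift_queries)} to Redshift")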
168 changes: 168 additions & 0 deletions experiments/15-e2e-scenarios-v2/slo_change/slo_change_config.yml
@@ -0,0 +1,168 @@
# This file contains configurations that are used by BRAD. These are default
# values and should be customized for specific situations.

# BRAD's front end servers will listen for client connections on this interface
# and port. If `num_front_ends` is greater than one, subsequent front ends will
# listen on successive ports (e.g., 6584, 6585, etc.).
front_end_interface: "0.0.0.0"
front_end_port: 6583
num_front_ends: 24

# Logging paths. If the value is in ALL_CAPS (with underscores), it is
# interpreted as an environment variable (BRAD will log to the path stored in
# the environment variable).

# Where BRAD's daemon process will write its logs.
daemon_log_file: COND_OUT

# Where BRAD's front end processes will write their logs.
front_end_log_path: COND_OUT

# Where BRAD's blueprint planner will write debug logs.
planner_log_path: COND_OUT

# Where BRAD's metrics loggers will write their logs.
metrics_log_path: COND_OUT

# Probability that each transactional query will be logged.
txn_log_prob: 0.01

# Set to a non-zero value to enable automatic data syncing. When this is set
# to 0, automatic syncing is disabled.
data_sync_period_seconds: 0

# BRAD's front end servers will report their metrics at regular intervals.
front_end_metrics_reporting_period_seconds: 30
front_end_query_latency_buffer_size: 100

# `default` means to use the policy encoded in the blueprint. Other values will
# override the blueprint.
routing_policy: default

# Whether to disable table movement for benchmark purposes (i.e., keep all
# tables on all engines).
disable_table_movement: true

# Epoch length for metrics and forecasting. This is the granularity at which
# metrics/forecasting will be performed.
epoch_length:
  weeks: 0
  days: 0
  hours: 0
  minutes: 1

# Blueprint planning strategy.
strategy: fp_query_based_beam

# Used to specify the period of time over which to use data for planning.
# Currently, this is a "look behind" window for the workload.
planning_window:
  weeks: 0
  days: 0
  hours: 1
  minutes: 0

# Used to aggregate metrics collected in the planning window.
metrics_agg:
  method: ewm # 'mean' is another option
  alpha: 0.86466472 # 1 - 1 / e^2

# Used during planning.
reinterpret_second_as: 1

# The query distribution must change by at least this much for a new blueprint
# to be accepted.
query_dist_change_frac: 0.1

# The search bound for the provisioning.
max_provisioning_multiplier: 2.5

# Flag options for blueprint planning.
use_io_optimized_aurora: true
use_recorded_routing_if_available: true
ensure_tables_together_on_one_engine: true

# Loads used to prime the system when no information is available.
aurora_initialize_load_fraction: 0.25
redshift_initialize_load_fraction: 0.25

# BRAD will not reduce predicted load lower than these values. Raise these
# values to be more conservative against mispredictions.
aurora_min_load_removal_fraction: 0.8
redshift_min_load_removal_fraction: 0.9

# Blueprint planning performance ceilings.
# These will change to 30 s and 30 ms during the experiment.
query_latency_p90_ceiling_s: 60.0
txn_latency_p90_ceiling_s: 0.060

# If set to true, BRAD will attempt to use the specified preset Redshift
# clusters instead of resizing the main Redshift cluster.
use_preset_redshift_clusters: true

# Used for ordering blueprints during planning.
comparator:
  type: benefit_perf_ceiling # or `perf_ceiling`

  benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator
    weeks: 0
    days: 0
    hours: 3
    minutes: 0

  penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator
  penalty_power: 8 # Only used by the `benefit_perf_ceiling` comparator

# Used for precomputed predictions.
std_datasets:
  - name: regular
    path: workloads/IMDB_100GB/regular_test/
  - name: adhoc
    path: workloads/IMDB_100GB/adhoc_test/

aurora_max_query_factor: 4.0
aurora_max_query_factor_replace: 10000.0

redshift_peak_load_threshold: 95.0
redshift_peak_load_multiplier: 2.0

planner_max_workers: 16
aurora_provisioning_search_distance: 900.0
redshift_provisioning_search_distance: 900.0

# Blueprint planning trigger configs.

triggers:
  enabled: true
  check_period_s: 90 # Triggers are checked every X seconds.
  check_period_offset_s: 600 # Wait 10 mins before starting.
  observe_new_blueprint_mins: 10

  elapsed_time:
    disabled: true
    multiplier: 60 # Multiplier over `planning_window`.

  redshift_cpu:
    lo: 15
    hi: 85
    sustained_epochs: 3

  aurora_cpu:
    lo: 15
    hi: 85
    sustained_epochs: 3

  variable_costs:
    disabled: true
    threshold: 1.0

  query_latency_ceiling:
    ceiling_s: 60.0
    sustained_epochs: 3

  txn_latency_ceiling:
    ceiling_s: 0.060
    sustained_epochs: 3

  recent_change:
    delay_epochs: 5
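One note on the metrics_agg block above: the alpha value 0.86466472 is the stated 1 - 1 / e^2 rounded to eight decimal places, which is easy to confirm with a quick sketch (not part of the pull request):

# Verify the metrics_agg EWM alpha used in slo_change_config.yml: 1 - 1 / e^2.
import math

alpha = 1.0 - 1.0 / math.e**2
print(round(alpha, 8))  # 0.86466472, matching the value in the config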
4 changes: 4 additions & 0 deletions src/brad/config/system_event.py
@@ -33,3 +33,7 @@ class SystemEvent(enum.Enum):

    # Use this for long running experiments.
    ReachedExpectedState = "reached_expected_state"

    # Used when a service level objective is changed while BRAD is running (used
    # for experiments).
    ChangedSlos = "changed_slos"
8 changes: 7 additions & 1 deletion src/brad/daemon/daemon.py
@@ -733,7 +733,7 @@ async def _handle_internal_command(self, command: str) -> RowList:
            parts = command.split(" ")
            if self._temp_config is None:
                return [("Cannot change SLOs because TempConfig is missing.",)]
-           if len(parts) <= 3:
+           if len(parts) < 3:
                return [("Need to specify query and txn p90 SLOs",)]

            query_p90_s = float(parts[1])
@@ -757,6 +757,12 @@ async def _handle_internal_command(self, command: str) -> RowList:
                elif isinstance(t, TransactionLatencyCeiling):
                    t.set_latency_ceiling(txn_p90_s)

            if self._system_event_logger is not None:
                self._system_event_logger.log(
                    SystemEvent.ChangedSlos,
                    f"query_p90_s={query_p90_s}; txn_p90_s={txn_p90_s}",
                )

            return [
                (
                    f"p90 SLOs changed to (query {query_p90_s:.3f} s), (txn {txn_p90_s:.3f} s)",
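For completeness, the daemon change above consumes the internal command that the workload scripts send (BRAD_CHANGE_SLO <query_p90_s> <txn_p90_s>). Below is a minimal standalone sketch (not part of the pull request) of that parsing step, using a hypothetical helper name; the real logic lives in Daemon._handle_internal_command and also updates the planner's latency-ceiling triggers:

# Hypothetical standalone helper mirroring the parsing shown in the diff above.
from typing import Tuple


def parse_change_slo(command: str) -> Tuple[float, float]:
    # The command token plus two numbers are required, as in the diff's check.
    parts = command.split(" ")
    if len(parts) < 3:
        raise ValueError("Need to specify query and txn p90 SLOs")
    query_p90_s = float(parts[1])
    txn_p90_s = float(parts[2])  # assumed to be the third token, as sent by the scripts
    return query_p90_s, txn_p90_s


# The experiment scripts send "BRAD_CHANGE_SLO 30.0 0.030": a 30 s analytics
# p90 ceiling and a 30 ms transaction p90 ceiling.
print(parse_change_slo("BRAD_CHANGE_SLO 30.0 0.030"))  # (30.0, 0.03)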