From 495074fab0f07ad70b3b5ec99e486802d884d51f Mon Sep 17 00:00:00 2001
From: Junru Shao
Date: Mon, 13 Dec 2021 17:46:30 -0800
Subject: [PATCH] Misc Improvement (#550)

* ...

* minor
---
 .../tvm/meta_schedule/cost_model/xgb_model.py | 21 +++++----
 python/tvm/meta_schedule/tune.py              | 15 +++----
 src/meta_schedule/utils.h                     | 45 ++++++++++++++-----
 .../meta_schedule/run_meta_schedule_cuda.sh   | 11 +++--
 .../meta_schedule/test_meta_schedule.py       | 43 +++++++++---------
 5 files changed, 82 insertions(+), 53 deletions(-)

diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py b/python/tvm/meta_schedule/cost_model/xgb_model.py
index a5e5010731..6b833963f3 100644
--- a/python/tvm/meta_schedule/cost_model/xgb_model.py
+++ b/python/tvm/meta_schedule/cost_model/xgb_model.py
@@ -319,8 +319,11 @@ def load(self, path: str) -> None:
         Since XGBoost model trains from scratch, each time we can only load the model without the
         previous cached features / results so any call of update won't use previous training data.
         """
+        import xgboost as xgb  # pylint: disable=import-outside-toplevel
+
         with tempfile.TemporaryDirectory() as tmp_dir:
             untar(path, tmp_dir)
+            self.booster = xgb.Booster()
             self.booster.load_model(os.path.join(tmp_dir, "model.bin"))
             self.cached_features = list(
                 np.load(os.path.join(tmp_dir, "cached_features.npy"), allow_pickle=True)
@@ -346,23 +349,24 @@ def save(self, path: str) -> None:
         import xgboost as xgb  # pylint: disable=import-outside-toplevel
 
         if self.booster is None:
-            # save all the paramaters
+            # save all the parameters
             self.booster = xgb.Booster(self.config.to_dict())
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            self.booster.save_model(os.path.join(tmpdirname, "model.bin"))
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            self.booster.save_model(os.path.join(tmp_dir, "model.bin"))
             np.save(
-                os.path.join(tmpdirname, "cached_features.npy"),
+                os.path.join(tmp_dir, "cached_features.npy"),
                 np.array(self.cached_features, dtype=object),
             )
-            np.save(os.path.join(tmpdirname, "cached_mean_costs.npy"), self.cached_mean_costs)
+            np.save(os.path.join(tmp_dir, "cached_mean_costs.npy"), self.cached_mean_costs)
             tar(
                 path,
                 [
-                    os.path.join(tmpdirname, "model.bin"),
-                    os.path.join(tmpdirname, "cached_features.npy"),
-                    os.path.join(tmpdirname, "cached_mean_costs.npy"),
+                    os.path.join(tmp_dir, "model.bin"),
+                    os.path.join(tmp_dir, "cached_features.npy"),
+                    os.path.join(tmp_dir, "cached_mean_costs.npy"),
                 ],
             )
+        logger.info("Saved XGBModel to %s", path)
 
     def update(
         self,
@@ -491,7 +495,6 @@ def average_peak_score(
         )
 
         del self.d_train
-        # todo(zxybazh): measure callback to save the model
 
     def _predict(  # type: ignore # pylint: disable=invalid-name
         self,
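
Illustrative note (not part of the patch): the "self.booster = xgb.Booster()" line added
to load() above is there because xgboost's load_model() is an instance method, so an
empty Booster must exist before a previously saved model file can be read back into it.
A minimal sketch of that pattern, with a hypothetical file path:

    import xgboost as xgb

    booster = xgb.Booster()               # construct an empty booster first
    booster.load_model("/tmp/model.bin")  # then populate it from a previously saved file
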
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index 8eec8cf2bf..4b42510773 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -256,10 +256,10 @@ def _runner(runner: Optional[Runner]) -> Runner:
         return runner
 
     @staticmethod
-    def _database(database: Union[None, Database], path: str) -> Database:
+    def _database(database: Union[None, Database], task_name: str, path: str) -> Database:
         if database is None:
-            path_workload = os.path.join(path, "workload.json")
-            path_tuning_record = os.path.join(path, "tuning_record.json")
+            path_workload = os.path.join(path, f"{task_name}_database_workload.json")
+            path_tuning_record = os.path.join(path, f"{task_name}_database_tuning_record.json")
             logger.info(
                 "Creating JSONDatabase. Workload at: %s. Tuning records at: %s",
                 path_workload,
@@ -269,8 +269,6 @@ def _database(database: Union[None, Database], path: str) -> Database:
                 path_workload=path_workload,
                 path_tuning_record=path_tuning_record,
             )
-        elif callable(database):
-            database = database(path)
         if not isinstance(database, Database):
             raise TypeError(f"Expected `database` to be Database, but gets: {database}")
         return database
@@ -496,7 +494,7 @@ def tune_tir(
     logger.info("Working directory: %s", work_dir)
     # pylint: disable=protected-access
     mod = Parse._mod(mod)
-    database = Parse._database(database, work_dir)
+    database = Parse._database(database, task_name, work_dir)
     tune_context = Parse._tune_context(
         tune_context=None,
         mod=mod,
@@ -529,6 +527,7 @@ def tune_tir(
     assert len(bests) == 1
     sch = Schedule(mod)
     bests[0].trace.apply_to_schedule(sch, remove_postproc=False)
+    task_scheduler.cost_model.save(os.path.join(work_dir, f"{task_name}.xgb"))
     return sch
 
 
@@ -663,7 +662,7 @@ def tune_relay(
     # pylint: disable=protected-access
     tune_contexts = []
     target = Parse._target(target)
-    database = Parse._database(database, work_dir)
+    database = Parse._database(database, task_name, work_dir)
     for task in extracted_tasks:
         assert len(task.dispatched) == 1, "Only size 1 dispatched task list is supported for now"
         mod = Parse._mod(task.dispatched[0])
@@ -692,7 +691,7 @@ def tune_relay(
     )
     # pylint: enable=protected-access
     task_scheduler.tune()
-    schs = []
+    schs: List[Schedule] = []
     for task in tune_contexts:
         mod = task.mod
         workload = database.commit_workload(mod)
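
Illustrative note (not part of the patch): with the per-task file names introduced in
Parse._database above, several tasks can share one work_dir without overwriting each
other's records. A sketch of the database created when none is passed in; the task name
and directory are hypothetical, and it assumes JSONDatabase creates the JSON files when
they do not exist yet:

    import os
    from tvm.meta_schedule.database import JSONDatabase

    task_name = "C1D"
    work_dir = "/tmp/ms-work-dir"
    os.makedirs(work_dir, exist_ok=True)
    database = JSONDatabase(
        path_workload=os.path.join(work_dir, f"{task_name}_database_workload.json"),
        path_tuning_record=os.path.join(work_dir, f"{task_name}_database_tuning_record.json"),
    )
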
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index a8e2733ed6..c68dfbd31c 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -278,17 +278,23 @@ inline int GetTargetNumCores(const Target& target) {
   return num_cores;
 }
 
+/*!
+ * \brief A helper data structure that replays a trace and collects failure counts
+ * for each postprocessor
+ */
 struct ThreadedTraceApply {
-  const Array<Postproc>& postprocs;
-  std::vector<std::unique_ptr<std::atomic<int>>> fail_counter;
-
+  /*! \brief Constructor */
   explicit ThreadedTraceApply(const Array<Postproc>& postprocs)
-      : postprocs(postprocs), fail_counter(postprocs.size()) {
-    for (std::unique_ptr<std::atomic<int>>& p : fail_counter) {
-      p = std::make_unique<std::atomic<int>>(0);
+      : n_(postprocs.size()), items_(new Item[n_]) {
+    for (int i = 0; i < n_; ++i) {
+      items_[i].postproc = postprocs[i];
+      items_[i].fail_counter = 0;
     }
   }
 
+  /*! \brief Destructor */
+  ~ThreadedTraceApply() { delete[] items_; }
+
   /*!
    * \brief Apply the trace and postprocessors to an IRModule
    * \param mod The IRModule to be applied
@@ -305,23 +311,38 @@ struct ThreadedTraceApply {
         /*error_render_level=*/tir::ScheduleErrorRenderLevel::kNone);
     trace->ApplyToSchedule(sch, /*remove_postproc=*/true);
     sch->EnterPostproc();
-    for (int i = 0, n = postprocs.size(); i < n; ++i) {
-      if (!postprocs[i]->Apply(sch)) {
-        ++*fail_counter[i];
+    for (int i = 0; i < n_; ++i) {
+      Item& item = items_[i];
+      if (!item.postproc->Apply(sch)) {
+        ++item.fail_counter;
         return NullOpt;
       }
     }
     return sch;
   }
 
+  /*! \brief Returns a string summarizing the failures on each postprocessor */
   std::string SummarizeFailures() const {
     std::ostringstream os;
-    for (int i = 0, n = postprocs.size(); i < n; ++i) {
-      os << "Postproc #" << i << " [" << postprocs[i]  //
-         << "]: " << *fail_counter[i] << " failure(s)\n";
+    for (int i = 0; i < n_; ++i) {
+      const Item& item = items_[i];
+      os << "Postproc #" << i << " [" << item.postproc  //
+         << "]: " << item.fail_counter.load() << " failure(s)";
+      if (i != n_ - 1) {
+        os << "\n";
+      }
     }
     return os.str();
   }
+
+ private:
+  struct Item {
+    Postproc postproc{nullptr};
+    std::atomic<int> fail_counter{0};
+  };
+
+  int n_;
+  Item* items_;
 };
 
 }  // namespace meta_schedule
diff --git a/tests/python/meta_schedule/run_meta_schedule_cuda.sh b/tests/python/meta_schedule/run_meta_schedule_cuda.sh
index 87eec9c49b..f01b029235 100644
--- a/tests/python/meta_schedule/run_meta_schedule_cuda.sh
+++ b/tests/python/meta_schedule/run_meta_schedule_cuda.sh
@@ -1,25 +1,28 @@
-# set -euxo pipefail
+set -euxo pipefail
 
 RPC_HOST="192.168.6.66"
 RPC_PORT="4445"
 RPC_KEY="jetson-agx-xavier"
 TARGET="nvidia/jetson-agx-xavier"
-LOG_DIR=$HOME/logs/ms-cuda/
-NUM_TRIALS=800
+LOG_DIR=/tmp/logs/ms-cuda/
+NUM_TRIALS=2000
 
 mkdir -p $LOG_DIR
 
 run () {
     name=$1
+    work_dir=$LOG_DIR/$name/
+    mkdir -p $work_dir
     echo "Running workload $name"
     python tests/python/meta_schedule/test_meta_schedule.py \
         --workload "$name" \
         --target "$TARGET" \
+        --work-dir "$work_dir" \
         --rpc-host "$RPC_HOST" \
         --rpc-port "$RPC_PORT" \
         --rpc-key "$RPC_KEY" \
         --num-trials $NUM_TRIALS \
-        2>&1 | tee "$LOG_DIR/$name.log"
+        2>&1 | tee "$work_dir/$name.log"
 }
 
 # Single op
diff --git a/tests/python/meta_schedule/test_meta_schedule.py b/tests/python/meta_schedule/test_meta_schedule.py
index bc519b5305..64f7034546 100644
--- a/tests/python/meta_schedule/test_meta_schedule.py
+++ b/tests/python/meta_schedule/test_meta_schedule.py
@@ -18,7 +18,6 @@
 import argparse
 import logging
 from os import cpu_count
-import tempfile
 
 import tvm
 from tvm import meta_schedule as ms
@@ -43,6 +42,11 @@ def _parse_args():
         type=int,
         required=True,
     )
+    args.add_argument(
+        "--work-dir",
+        type=str,
+        required=True,
+    )
     args.add_argument(
         "--rpc-host",
         type=str,
@@ -85,25 +89,24 @@ def main():
         alloc_repeat=3,
         max_workers=ARGS.rpc_workers,
     )
-    with tempfile.TemporaryDirectory() as work_dir:
-        sch: tir.Schedule = ms.tune_tir(
-            mod=create_te_workload(ARGS.workload, 0),
-            target=ARGS.target,
-            config=ms.EvolutionarySearchConfig(
-                num_trials_per_iter=64,
-                num_trials_total=ARGS.num_trials,
-                init_max_fail_count=1024,
-            ),
-            runner=runner,
-            task_name=ARGS.workload,
-            work_dir=work_dir,
-            num_threads=cpu_count(),
-        )
-        if sch is None:
-            print("No valid schedule found!")
-        else:
-            print(sch.mod.script())
-            print(sch.trace)
+    sch: tir.Schedule = ms.tune_tir(
+        mod=create_te_workload(ARGS.workload, 0),
+        target=ARGS.target,
+        config=ms.EvolutionarySearchConfig(
+            num_trials_per_iter=64,
+            num_trials_total=ARGS.num_trials,
+            init_max_fail_count=8192,
+        ),
+        runner=runner,
+        task_name=ARGS.workload,
+        work_dir=ARGS.work_dir,
+        num_threads=cpu_count(),
+    )
+    if sch is None:
+        print("No valid schedule found!")
+    else:
+        print(sch.mod.script())
+        print(sch.trace)
 
 
 if __name__ == "__main__":
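
Illustrative note (not part of the patch): tune_tir above now also saves its trained cost
model to work_dir/{task_name}.xgb, next to the per-task JSON database files and the
per-workload log written by the updated shell script. A sketch of reloading it later for
inspection; the paths are hypothetical, and passing PerStoreFeature as the extractor is an
assumption, so use whatever feature extractor the tuning run actually used:

    import os
    from tvm.meta_schedule.cost_model.xgb_model import XGBModel
    from tvm.meta_schedule.feature_extractor import PerStoreFeature

    work_dir = "/tmp/logs/ms-cuda/C1D"   # per-workload directory created by the run script
    task_name = "C1D"
    model = XGBModel(extractor=PerStoreFeature())  # extractor choice is an assumption
    model.load(os.path.join(work_dir, f"{task_name}.xgb"))
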