Skip to content

Commit

Permalink
Merge pull request #33 from satra/master
Browse files Browse the repository at this point in the history
enh: add trained model saving
  • Loading branch information
satra authored Dec 8, 2020
2 parents 4078bf7 + 68b2953 commit 5833902
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 2 deletions.
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,19 @@ Each model contains:
amount of predictions and F the different SHAP values for each feature.
`shaps` is empty if `gen_shap` is set to `false` or if `permute` is set
to true.
- `model`: A pickled version of the model trained on all the input data.
One can use this model to test on new data that has the exact same input
shape and features as the trained model. For example:
```python
import pickle as pk
import numpy as np
with open("results-20201208T010313.229190.pkl", "rb") as fp:
data = pk.load(fp)
trained_model = data[0][1].output.model
trained_model.predict(np.random.rand(1, 30))
```
Please check the inputs stored in `data[N][0]` (in particular the value of
`ml_wf.permute`) to ensure that you are not using a model trained on permuted labels.
- One figure per metric with performance distribution across splits (with or
without null distribution trained on permuted labels)
- One figure per any metric with the word `score` in it reporting the results of
Expand Down Expand Up @@ -202,7 +215,7 @@ The actual numeric values are stored in a correspondingly named pkl file.
## Debugging

You will need to understand a bit of pydra to know how to debug this application for
now. If the process crashes, the easiest way to restart is to remove the `cache-wf`
now. If the process crashes, the easiest way to restart is to remove the `cache-wf`
folder first. However, if you are rerunning, you could also remove any `.lock` file
in the `cache-wf` directory.

Expand Down
23 changes: 22 additions & 1 deletion pydra_ml/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@
from pydra.utils.messenger import AuditFlag, FileMessenger
import typing as ty
import os
from .tasks import read_file, gen_splits, train_test_kernel, calc_metric, get_shap
from .tasks import (
read_file,
gen_splits,
train_test_kernel,
calc_metric,
get_shap,
create_model,
)
from .report import gen_report

# Create pydra tasks
Expand Down Expand Up @@ -36,6 +43,10 @@

get_shap_pdt = task(annotate({"return": {"shaps": ty.Any}})(get_shap))

# Pydra task wrapping `create_model`. The return annotation names its two
# return values `output` and `model`, which is how the workflow refers to
# them (e.g. `wf.create_model.lzout.model`).
create_model_pdt = task(
annotate({"return": {"output": ty.Any, "model": ty.Any}})(create_model)
)


def gen_workflow(inputs, cache_dir=None, cache_locations=None):
wf = pydra.Workflow(
Expand Down Expand Up @@ -98,12 +109,22 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
)
)
wf.shap.combine("fit_clf.split_index")
wf.add(
create_model_pdt(
name="create_model",
X=wf.readcsv.lzout.X,
y=wf.readcsv.lzout.Y,
clf_info=wf.lzin.clf_info,
permute=wf.lzin.permute,
)
)
wf.set_output(
[
("output", wf.metric.lzout.output),
("score", wf.metric.lzout.score),
("shaps", wf.shap.lzout.shaps),
("feature_names", wf.readcsv.lzout.feature_names),
("model", wf.create_model.lzout.model),
]
)
return wf
Expand Down
46 changes: 46 additions & 0 deletions pydra_ml/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,49 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"):
explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5))
shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg)
return shaps


def create_model(X, y, clf_info, permute):
    """Fit a scikit-learn pipeline on the complete dataset.

    :param X: Input features
    :param y: Target variables; flattened with ``ravel()`` before fitting
    :param clf_info: Estimator specification. Either a single spec of the
        form ``(module, class_name[, params[, param_grid]])``, which is
        preceded by a ``StandardScaler`` step named ``"std"``, or a list of
        such specs (each itself a list), which are chained into one pipeline
        in the given order.
    :param permute: If truthy, fit against a random permutation of ``y``
    :return: ``((y, predicted), pipe)`` -- the flattened, unpermuted targets
        paired with the pipeline's predictions on ``X``, and the fitted
        pipeline itself
    """
    from sklearn.pipeline import Pipeline
    import numpy as np

    def build_estimator(spec):
        # spec[0] is the module to import, spec[1] the class name inside it.
        module = __import__(spec[0], fromlist=[spec[1]])
        kwargs = spec[2] if len(spec) > 2 else {}
        estimator = getattr(module, spec[1])(**kwargs)
        if len(spec) != 4:
            return estimator
        # A fourth entry is a parameter grid: wrap the estimator in a search.
        from sklearn.model_selection import GridSearchCV

        return GridSearchCV(estimator, param_grid=spec[3])

    if not isinstance(clf_info[0], list):
        # Single estimator spec: standardise the features before fitting it.
        from sklearn.preprocessing import StandardScaler

        estimator = build_estimator(clf_info)
        pipe = Pipeline([("std", StandardScaler()), (clf_info[1], estimator)])
    else:
        # List of specs: each one becomes a pipeline step named after its class.
        pipe = Pipeline([(spec[1], build_estimator(spec)) for spec in clf_info])

    y = y.ravel()
    # In permuted mode the labels are shuffled for fitting only; the returned
    # targets stay in their original order.
    targets = y[np.random.permutation(range(len(y)))] if permute else y
    pipe.fit(X, targets)
    predicted = pipe.predict(X)
    return (y, predicted), pipe
5 changes: 5 additions & 0 deletions pydra_ml/tests/test_classifier.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from ..classifier import gen_workflow, run_workflow
import numpy as np


def test_classifier(tmpdir):
Expand Down Expand Up @@ -32,6 +33,8 @@ def test_classifier(tmpdir):
assert results[0][0]["ml_wf.clf_info"][1] == "MLPClassifier"
assert results[0][0]["ml_wf.permute"]
assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0]
assert hasattr(results[2][1].output.model, "predict")
assert isinstance(results[2][1].output.model.predict(np.ones((1, 30))), np.ndarray)


def test_regressor(tmpdir):
Expand Down Expand Up @@ -69,3 +72,5 @@ def test_regressor(tmpdir):
assert results[0][0]["ml_wf.clf_info"][-1][1] == "MLPRegressor"
assert results[0][0]["ml_wf.permute"]
assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0]
assert hasattr(results[2][1].output.model, "predict")
assert isinstance(results[2][1].output.model.predict(np.ones((1, 10))), np.ndarray)

0 comments on commit 5833902

Please sign in to comment.