cumc-dbmi · ChaoPang · Sep 5, 2024 · Sep 5, 2024 · Sep 5, 2024 · Sep 5, 2024
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -0,0 +1,39 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python application
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        pip install -e .
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        PYTHONPATH=./: pytest
diff --git a/README.md b/README.md
@@ -73,7 +73,8 @@ tar -xvf omop_synthea.tar .
 ```
 Convert the OMOP dataset to the MEDS format
 ```console
-meds_etl_omop omop_synthea synthea_meds
+pip install meds_etl==0.3.6
+meds_etl_omop omop_synthea synthea_meds
 ```
 Convert MEDS to the meds_reader database to get the patient level data
 ```console

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,9 +26,6 @@ dependencies = [
     "dask==2024.1.1",
     "dask[dataframe]==2024.1.1",
     "datasets==2.16.1",
-    "docarray==0.40.0",
-    "docarray[hnswlib]==0.40.0",
-    "docarray[weaviate]==0.40.0",
     "evaluate==0.4.1",
     "fast-ml==3.68",
     "fastparquet==0.8.1",
@@ -58,7 +55,6 @@ dependencies = [
     "transformers==4.39.3",
     "Werkzeug==3.0.1",
     "wandb==0.17.8",
-    "Whoosh==2.7.4",
     "xgboost==2.0.3"
 ]
 

diff --git a/src/cehrbert/config/output_names.py b/src/cehrbert/config/output_names.py
@@ -1,9 +1,9 @@
-parquet_data_path = 'patient_sequence'
-qualified_concept_list_path = 'qualified_concept_list'
-time_attention_model_path = 'time_aware_model.h5'
-bert_model_validation_path = 'bert_model.h5'
-mortality_data_path = 'mortality'
-heart_failure_data_path = 'heart_failure'
-hospitalization_data_path = 'hospitalization'
-information_content_data_path = 'information_content'
-concept_similarity_path = 'concept_similarity'
+PARQUET_DATA_PATH = 'patient_sequence'
+QUALIFIED_CONCEPT_LIST_PATH = 'qualified_concept_list'
+TIME_ATTENTION_MODEL_PATH = 'time_aware_model.h5'  # joined onto the time-attention model folder in evaluations/evaluation.py
+BERT_MODEL_VALIDATION_PATH = 'bert_model.h5'  # joined onto the vanilla BERT model folder in evaluations/evaluation.py
+MORTALITY_DATA_PATH = 'mortality'
+HEART_FAILURE_DATA_PATH = 'heart_failure'
+HOSPITALIZATION_DATA_PATH = 'hospitalization'
+INFORMATION_CONTENT_DATA_PATH = 'information_content'
+CONCEPT_SIMILARITY_PATH = 'concept_similarity'
diff --git a/src/cehrbert/evaluations/evaluation.py b/src/cehrbert/evaluations/evaluation.py
@@ -56,7 +56,7 @@ def evaluate_sequence_models(args):
         time_attention_tokenizer_path = find_tokenizer_path(args.time_attention_model_folder)
         time_aware_model_path = os.path.join(
             args.time_attention_model_folder,
-            p.time_attention_model_path
+            p.TIME_ATTENTION_MODEL_PATH
         )
         BiLstmModelEvaluator(
             dataset=dataset,
@@ -83,7 +83,7 @@ def evaluate_sequence_models(args):
         validate_folder(args.vanilla_bert_model_folder)
         bert_tokenizer_path = find_tokenizer_path(args.vanilla_bert_model_folder)
         bert_model_path = os.path.join(args.vanilla_bert_model_folder,
-                                       p.bert_model_validation_path)
+                                       p.BERT_MODEL_VALIDATION_PATH)
         BertFeedForwardModelEvaluator(
             dataset=dataset,
             evaluation_folder=args.evaluation_folder,
@@ -108,7 +108,7 @@ def evaluate_sequence_models(args):
         validate_folder(args.vanilla_bert_model_folder)
         bert_tokenizer_path = find_tokenizer_path(args.vanilla_bert_model_folder)
         bert_model_path = os.path.join(args.vanilla_bert_model_folder,
-                                       p.bert_model_validation_path)
+                                       p.BERT_MODEL_VALIDATION_PATH)
         SlidingBertModelEvaluator(
             dataset=dataset,
             evaluation_folder=args.evaluation_folder,
@@ -134,7 +134,7 @@ def evaluate_sequence_models(args):
         validate_folder(args.vanilla_bert_model_folder)
         bert_tokenizer_path = find_tokenizer_path(args.vanilla_bert_model_folder)
         bert_model_path = os.path.join(args.vanilla_bert_model_folder,
-                                       p.bert_model_validation_path)
+                                       p.BERT_MODEL_VALIDATION_PATH)
         BertLstmModelEvaluator(
             dataset=dataset,
             evaluation_folder=args.evaluation_folder,
@@ -160,7 +160,7 @@ def evaluate_sequence_models(args):
     if RANDOM_VANILLA_BERT_LSTM in args.model_evaluators:
         validate_folder(args.vanilla_bert_model_folder)
         bert_model_path = os.path.join(args.vanilla_bert_model_folder,
-                                       p.bert_model_validation_path)
+                                       p.BERT_MODEL_VALIDATION_PATH)
         bert_tokenizer_path = find_tokenizer_path(args.vanilla_bert_model_folder)
         visit_tokenizer_path = find_visit_tokenizer_path(args.vanilla_bert_model_folder)
 
@@ -195,7 +195,7 @@ def evaluate_sequence_models(args):
     if HIERARCHICAL_BERT_LSTM in args.model_evaluators:
         validate_folder(args.vanilla_bert_model_folder)
         bert_model_path = os.path.join(args.vanilla_bert_model_folder,
-                                       p.bert_model_validation_path)
+                                       p.BERT_MODEL_VALIDATION_PATH)
 
         bert_tokenizer_path = find_tokenizer_path(args.vanilla_bert_model_folder)
         bert_visit_tokenizer_path = find_visit_tokenizer_path(args.vanilla_bert_model_folder)
@@ -227,7 +227,7 @@ def evaluate_sequence_models(args):
     if HIERARCHICAL_BERT_POOLING in args.model_evaluators:
         validate_folder(args.vanilla_bert_model_folder)
         bert_model_path = os.path.join(args.vanilla_bert_model_folder,
-                                       p.bert_model_validation_path)
+                                       p.BERT_MODEL_VALIDATION_PATH)
 
         bert_tokenizer_path = find_tokenizer_path(args.vanilla_bert_model_folder)
         bert_visit_tokenizer_path = find_visit_tokenizer_path(args.vanilla_bert_model_folder)
@@ -259,7 +259,7 @@ def evaluate_sequence_models(args):
     if RANDOM_HIERARCHICAL_BERT_LSTM in args.model_evaluators:
         validate_folder(args.vanilla_bert_model_folder)
         bert_model_path = os.path.join(args.vanilla_bert_model_folder,
-                                       p.bert_model_validation_path)
+                                       p.BERT_MODEL_VALIDATION_PATH)
 
         bert_tokenizer_path = find_tokenizer_path(args.vanilla_bert_model_folder)
         bert_visit_tokenizer_path = find_visit_tokenizer_path(args.vanilla_bert_model_folder)