Skip to content

Commit

Permalink
Release v12.6.0 (#418)
Browse files Browse the repository at this point in the history
* Make consortium releases = public releases, add dashboard plots/tables (#385)

* Remove null oncotree codes in consortium release

* Filter out flagged mutations

* Add to dashboard plot

* Correct queries

* Subset

* Update description

* Add figure

* Fix

* Fix

* Add to dashboard

* Fix text

* Add variant counts

* Add index

* Add non somatic mutation check

* Add headers

* Add SNVs that were annotated as DNP

* Fix

* Store as file

* Fix

* Fix

* Fix text

* Add spacing

* update

* Add

* Only upload subset

* Add in files to release

* Don't filter out blacklist variants

* Use mutation file

* Correct

* Edit

* Fix

* Fix

* Update genie/database_to_staging.py

* Fix columns

* Add

* Remove comments

* Push

* Add 'Not released' value (#389)

* Add Not released value

* Use stringIO, fix tests

* Calculate missing counts (#388)

* Calculate missing counts

* Fix

* Update genie/database_to_staging.py

* Add function to calculate missing variant counts

* No need for blindtext

* Fix

* Shuffle sections

* Fix

* Fix missing variant counts

* Fix section

* Fix

* Fix headers

* Fix docker (#396)

* Fix docker

* Add comment

* Fix

* Fix

* Fix

Co-authored-by: EC2 Default User <[email protected]>

* Update dev

* Remove 3.6 support

* Don't remove 3.6 in dev branch

* check for vital status inconsistencies (#416)

* check for vital status inconsistencies

* Fix tests

* Add test

* Remove

* Add validation rules for vital status

* Remove redundant returns

* lint

* Check for inconsistent year and interval vital status columns

* Fix tests

* Add tests for vital status interval and dead check

* Add tests

* Fix tests

* Add tests

* Update Dockerfile (#417)

* Update dockerfile to use new ubuntu

* Remove py36 support as EOL

* update ubuntu version

* Update requirements

* install python dev

* Fix

* Update genie/__version__.py

Co-authored-by: EC2 Default User <[email protected]>
  • Loading branch information
thomasyu888 and EC2 Default User authored Jan 9, 2022
1 parent 5fe308e commit 4cacd9b
Show file tree
Hide file tree
Showing 6 changed files with 314 additions and 54 deletions.
71 changes: 41 additions & 30 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,37 +1,48 @@
FROM ubuntu:bionic
FROM ubuntu:hirsute-20211107
ENV DEBIAN_FRONTEND=noninteractive

# Must install this because gpg not installed
RUN apt-get update && apt-get install -y gnupg2 software-properties-common
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/'
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common \
dirmngr \
wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

#RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
RUN add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"

# General sys dependencies
RUN apt-get update && apt-get install -y --allow-unauthenticated \
bedtools \
dos2unix \
wget \
python3 \
python3-pip \
git \
r-base-core \
r-base-dev \
curl \
#synapser client dependencies
dpkg-dev \
zlib1g-dev \
libssl-dev \
libffi-dev \
libcurl4-openssl-dev \
# VariantAnnotation dependency
libxml2-dev \
# Supports data guide creation
texlive \
texinfo \
texlive-generic-recommended \
texlive-latex-extra \
# genome nexus
openjdk-8-jre
RUN apt-get update && apt-get install -y --allow-unauthenticated --no-install-recommends \
bedtools \
dos2unix \
python3 \
python3-pip \
python3-dev \
git \
r-base \
r-base-dev \
curl \
# synapser client dependencies
dpkg-dev \
zlib1g-dev \
libssl-dev \
libffi-dev \
libcurl4-openssl-dev \
# VariantAnnotation dependency
libxml2-dev \
# Supports data guide creation
texlive \
texinfo \
# texlive-generic-recommended \
texlive-latex-extra \
# genome nexus
openjdk-8-jre && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

#install pandoc 1.19.2.1 (dashboard use)
RUN wget https://github.com/jgm/pandoc/releases/download/1.19.2.1/pandoc-1.19.2.1-1-amd64.deb
Expand All @@ -47,7 +58,7 @@ RUN Rscript R/install_packages.R

RUN pip3 install --no-cache-dir cython
RUN pip3 install --no-cache-dir -r requirements.txt
RUN pip3 install -e .
RUN pip3 install --no-cache-dir -e .
# RUN python3 setup.py sdist
# RUN python3 setup.py develop

Expand Down
2 changes: 1 addition & 1 deletion genie/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "12.5.0"
__version__ = "12.6.0"
116 changes: 115 additions & 1 deletion genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,102 @@ def _check_year(clinicaldf: pd.DataFrame, year_col: int, filename: str,
return error


def _check_int_dead_consistency(clinicaldf: pd.DataFrame) -> str:
    """Validate that the DEAD and INT_DOD columns agree with each other.

    Args:
        clinicaldf: Clinical Data Frame

    Returns:
        Error message if at least one row is inconsistent, otherwise a
        blank string
    """
    # A missing column is reported by a different check, so there is
    # nothing to say about it here.
    has_required_cols = all(
        process_functions.checkColExist(clinicaldf, column)
        for column in ("INT_DOD", "DEAD")
    )
    if not has_required_cols:
        return ''

    text_values = ['Unknown', 'Not Collected', 'Not Applicable',
                   "Not Released"]
    # Compare on the string form so the match is made against the
    # literal text "True" / "False"
    dead_as_text = clinicaldf['DEAD'].astype(str)
    dead_rows = dead_as_text == "True"
    alive_rows = dead_as_text == "False"
    text_rows = clinicaldf['DEAD'].isin(text_values)

    # Rows where DEAD holds a text value must repeat that same value
    # in INT_DOD
    text_rows_match = all(
        clinicaldf.loc[text_rows, "DEAD"]
        == clinicaldf.loc[text_rows, "INT_DOD"]
    )
    # Dead patients cannot have a 'Not Applicable' interval
    dead_has_not_applicable = any(
        clinicaldf.loc[dead_rows, 'INT_DOD'] == "Not Applicable"
    )
    # Alive patients may only carry one of the text values as interval
    alive_only_text = all(
        clinicaldf.loc[alive_rows, 'INT_DOD'].isin(text_values)
    )

    if text_rows_match and alive_only_text and not dead_has_not_applicable:
        return ''
    return (
        "Patient Clinical File: DEAD value is inconsistent with INT_DOD "
        "for at least one patient.\n"
    )


def _check_int_year_consistency(
    clinicaldf: pd.DataFrame,
    cols: list, string_vals: list
) -> str:
    """
    Check if vital status interval and year columns are consistent in
    their values

    Two kinds of inconsistency are reported:

    * text: one of ``string_vals`` appears in some, but not all, of
      ``cols`` for the same row.
    * redaction: the interval column is redacted (``>32485`` / ``<6570``)
      while the year column is not redacted the matching way
      (``>89`` / ``<18``), or vice versa.

    Args:
        clinicaldf: Clinical Data Frame
        cols: Columns in the clinical data frame to compare. The column
            starting with "INT" is used as the interval column and the
            one starting with "YEAR" as the year column.
        string_vals: String values that aren't integers

    Returns:
        Error message if values are inconsistent or blank string
    """
    interval_col = ''
    year_col = ''
    for col in cols:
        # This is assuming that interval and year columns start with
        # INT/YEAR
        interval_col = col if col.startswith("INT") else interval_col
        year_col = col if col.startswith("YEAR") else year_col
        # Return empty string if columns don't exist because this error
        # is already handled.
        if not process_functions.checkColExist(clinicaldf, col):
            return ""

    is_text_inconsistent = False
    n_cols = len(cols)
    for str_val in string_vals:
        # Number of columns per row that hold this string value
        n_str = (clinicaldf[cols] == str_val).sum(axis=1)
        # A row is inconsistent when the value is in some but not all of
        # the columns, i.e. strictly between 0 and n_cols. Explicit
        # comparisons are used instead of
        # Series.between(inclusive="neither") because string values for
        # `inclusive` only exist in pandas >= 1.3; older versions treat
        # the string as a truthy bool and would flag every row.
        if ((n_str > 0) & (n_str < n_cols)).any():
            is_text_inconsistent = True
            # One offending value is enough to report the error
            break

    # Check that the redacted values are consistent
    is_redacted_int_89 = clinicaldf[interval_col] == ">32485"
    is_redacted_year_89 = clinicaldf[year_col] == ">89"
    is_redacted_int = clinicaldf[interval_col] == "<6570"
    is_redacted_year = clinicaldf[year_col] == "<18"
    is_redaction_inconsistent = bool(
        any(is_redacted_int != is_redacted_year) or
        any(is_redacted_int_89 != is_redacted_year_89)
    )

    col_strs = ', '.join(cols)
    if is_text_inconsistent and is_redaction_inconsistent:
        return (
            "Patient: you have inconsistent redaction and text "
            f"values in {col_strs}.\n"
        )
    if is_redaction_inconsistent:
        return (
            f"Patient: you have inconsistent redaction values in {col_strs}.\n"
        )
    if is_text_inconsistent:
        return f"Patient: you have inconsistent text values in {col_strs}.\n"

    return ""


# PROCESSING
def remap_clinical_values(clinicaldf: pd.DataFrame, sex_mapping: pd.DataFrame,
race_mapping: pd.DataFrame,
Expand Down Expand Up @@ -242,7 +338,7 @@ def _process(self, clinical, clinicalTemplate):
# remove unwanted columns again
keep_cols_idx = remapped_clindf.columns.isin(clinicalTemplate.columns)
remapped_clindf = remapped_clindf.drop(
remapped_clindf.columns[~keep_cols_idx], 1
columns=remapped_clindf.columns[~keep_cols_idx]
)
remapped_clindf['CENTER'] = self.center
return remapped_clindf
Expand Down Expand Up @@ -702,6 +798,24 @@ def _validate(self, clinicaldf, oncotree_link):
total_error.write(
"Patient Clinical File: Must have DEAD column.\n"
)
# CHECK: contact vital status value consistency
contact_error = _check_int_year_consistency(
clinicaldf=clinicaldf,
cols=["YEAR_CONTACT", "INT_CONTACT"],
string_vals=["Not Collected", "Unknown", "Not Released"]
)
total_error.write(contact_error)

# CHECK: death vital status value consistency
death_error = _check_int_year_consistency(
clinicaldf=clinicaldf,
cols=["YEAR_DEATH", "INT_DOD"],
string_vals=["Not Collected", "Unknown", "Not Applicable",
"Not Released"]
)
total_error.write(death_error)
death_error = _check_int_dead_consistency(clinicaldf=clinicaldf)
total_error.write(death_error)

# CHECK: SAMPLE_CLASS is optional attribute
have_column = process_functions.checkColExist(clinicaldf,
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
synapseclient>=2.3.0
pandas>=1.0
httplib2>=0.11.3
pycrypto>=2.6.1
pycryptodome>=3.12.0
PyYAML>=5.1
pyranges
chardet
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@
license='MIT',
packages=find_packages(),
zip_safe=False,
python_requires='>=3.6,<3.10',
python_requires='>=3.7,<3.10',
entry_points={'console_scripts': ['genie = genie.__main__:main']},
scripts=['bin/input_to_database.py',
'bin/database_to_staging.py'],
install_requires=['pandas>=1.0',
'synapseclient==2.4.0',
'httplib2>=0.11.3',
'pycrypto>=2.6.1',
'pycryptodome>=3.12.0',
'PyYAML>=5.1',
'chardet>=4.0.0'])
Loading

0 comments on commit 4cacd9b

Please sign in to comment.