diff --git a/Load/drivers.py b/Load/drivers.py index 8c0c617..ae18803 100644 --- a/Load/drivers.py +++ b/Load/drivers.py @@ -11,13 +11,19 @@ def add_driver_data(cursor, org_dict, feature_id, cvterm_id, dbxref_id, pub_id, fs_sql = """ INSERT INTO feature_synonym (synonym_id, feature_id, pub_id) VALUES (%s, %s, %s) """ feat_rel_sql = """ INSERT INTO feature_relationship (subject_id, object_id, type_id) VALUES (%s, %s, %s) RETURNING feature_relationship_id """ - count = gene_count = 400000 + # create domain + dom_name = 'DBD' + cursor.execute(feat_sql, (None, org_dict['Dmel'], 'DBD and LBD domains', 'ss-XP_DBD and LBD domains', + "", 0, cvterm_id['polypeptide'])) + feature_id[dom_name] = cursor.fetchone()[0] + + count = gene_count = 500000 for start in ['Scer\\GAL4', 'Hsap\\RELA']: # create gene - gene_count += 1 org_id = org_dict[start[:4]] sym_name = "{}".format(start) - unique_name = 'FBgn:{:07d}'.format(gene_count) + count = count + 1 + unique_name = 'FBgn{:07d}'.format(count) print("Adding gene {} {} for species {} - syn {}".format(unique_name, gene_count, start[:4], sym_name)) # create dbxref, accession -> uniquename cursor.execute(dbxref_sql, (db_id['FlyBase'], unique_name)) @@ -28,23 +34,59 @@ def add_driver_data(cursor, org_dict, feature_id, cvterm_id, dbxref_id, pub_id, feature_id[sym_name] = gene_id = cursor.fetchone()[0] for i in range(10): - # create allele/driver. + # select f.name, cvt2.name, f.uniquename, o.abbreviation, cvt.name + # from feature_relationship fr, cvterm cvt2, cvterm cvt, feature f, organism o + # where f.type_id = cvt2.cvterm_id and fr.object_id = f.feature_id and fr.type_id = cvt.cvterm_id and + # o.organism_id = f.organism_id and subject_id = 23124422; + # name | name | uniquename | abbreviation | name + # ------------------+---------------------------------+-------------+--------------+--------------------------- + # Scer\GAL4 | gene | FBgn0014445 | Scer | alleleof + # P{GAL4(DBD)-hb} | transgenic_transposable_element | FBtp0001259 | Ssss | associated_with + # pP{GAL4(DBD)-hb} | engineered_plasmid | FBmc0001249 | Ssss | gets_expression_data_from + # hb | gene | FBgn0001180 | Dmel | has_reg_region + if start == 'Scer\\GAL4': + gene_name = f'hb{i+1}' + else: + gene_name = f'pxn{i + 1}' count = count + 1 - al_sym_name = "{}[{}]".format(start, count) - unique_name = 'FBal:{:07d}'.format(count) - # create dbxref, accession -> uniquename - print("Adding allele {} {} for species {} - syn {}".format(unique_name, count, start[:4], sym_name)) + al_sym_name = "{}{}.{}".format(start, dom_name, gene_name) + al_name = "{}[{}.{}]".format(start, dom_name, gene_name) + unique_name = 'FBal{:07d}'.format(count) + print("Adding allele {} {} for species {} - syn {}".format(unique_name, count, start[:4], al_name)) cursor.execute(dbxref_sql, (db_id['FlyBase'], unique_name)) al_dbxref_id = cursor.fetchone()[0] - cursor.execute(feat_sql, (al_dbxref_id, org_id, al_sym_name, count, None, 0, cvterm_id['gene'])) - feature_id[al_sym_name] = allele_id = cursor.fetchone()[0] + cursor.execute(feat_sql, (al_dbxref_id, org_id, al_name, unique_name, None, 0, cvterm_id['allele'])) + feature_id[al_name] = cursor.fetchone()[0] + # add synonyms + cursor.execute(syn_sql, (feature_id[al_name], cvterm_id['symbol'], al_sym_name)) + syn_id = cursor.fetchone()[0] + cursor.execute(fs_sql, (syn_id, feature_id[al_name], pub_id)) + + cursor.execute(syn_sql, (feature_id[al_name], cvterm_id['symbol'], al_sym_name[5:])) # skip sp name + syn_id = cursor.fetchone()[0] + cursor.execute(fs_sql, (syn_id, feature_id[al_name], pub_id)) + # Scer\GAL4 | gene | FBgn0014445 | Scer | alleleof + cursor.execute(feat_rel_sql, (feature_id[al_name], feature_id[start], cvterm_id['alleleof'])) + + # hb | gene | FBgn0001180 | Dmel | has_reg_region + unique_name = 'FBgn{:07d}'.format(count) + print("Adding gene {} for species {} - syn {}".format(unique_name, start[:4], gene_name)) + cursor.execute(dbxref_sql, (db_id['FlyBase'], unique_name)) + gene_dbxref_id = cursor.fetchone()[0] + cursor.execute(feat_sql, (gene_dbxref_id, org_id, gene_name, unique_name, None, 0, cvterm_id['gene'])) + feature_id[gene_name] = cursor.fetchone()[0] + # add synonym + # select * from synonym where synonym_id = 6922299; + # synonym_id | name | type_id | synonym_sgml + # ------------+-------------------+---------+-------------------------- + # 6922299 | Hsap\RELA[AD.Pxn] | 59978 | Hsap\RELAAD.Pxn + cursor.execute(syn_sql, (feature_id[gene_name], cvterm_id['symbol'], gene_name)) + syn_id = cursor.fetchone()[0] + cursor.execute(fs_sql, (syn_id, feature_id[gene_name], pub_id)) + + cursor.execute(feat_rel_sql, (feature_id[al_name], feature_id[gene_name], cvterm_id['has_reg_region'])) - # add as feature relationship - cursor.execute(feat_rel_sql, (allele_id, gene_id, cvterm_id['alleleof'])) + # P{GAL4(DBD)-hb} | transgenic_transposable_element | FBtp0001259 | Ssss | associated_with - # add synonym for allele - cursor.execute(syn_sql, (al_sym_name, cvterm_id['symbol'], al_sym_name)) - symbol_id = cursor.fetchone()[0] - # add feature_synonym for allele - cursor.execute(fs_sql, (symbol_id, allele_id, pub_id)) + # pP{GAL4(DBD)-hb} | engineered_plasmid | FBmc0001249 | Ssss | gets_expression_data_from diff --git a/add-test_data.py b/add-test_data.py index 9043dee..3c7b226 100755 --- a/add-test_data.py +++ b/add-test_data.py @@ -21,6 +21,7 @@ from Load.grp import add_grp_data from Load.cell_line import add_cell_line_data from Load.aberration import add_aberration_data +from Load.drivers import add_driver_data conn = psycopg2.connect(database="fb_test") cursor = conn.cursor() @@ -522,6 +523,8 @@ def load_pub_author_pubprop(parsed_yaml): # Disease Implicated Variants (DIV) add_div_data(cursor, organism_id, cv_cvterm_id, feature_id, pub_id, db_dbxref) +# add drivers +add_driver_data(cursor, organism_id, feature_id, cvterm_id, dbxref_id, pub_id, db_id) # add chromosome_structure_variation print("Adding chromosome_structure_variation data.") diff --git a/data/cv_cvterm.yaml b/data/cv_cvterm.yaml index 892ee3b..d4601c2 100644 --- a/data/cv_cvterm.yaml +++ b/data/cv_cvterm.yaml @@ -8,10 +8,13 @@ SO: ['chromosome_arm', 'chromosome', 'gene', 'mRNA', 'DNA', 'golden_path', 'ncRN 'natural population', 'cloned_region', 'engineered_region', 'transgenic_transposable_element', 'transposable_element_insertion_site', 'chromosome_band', 'allele', 'transposable_element', 'natural_transposable_element', 'gene_group', 'polypeptide', 'chromosome_breakpoint', 'engineered_plasmid', 'sgRNA', - 'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site'] + 'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site', 'synthetic_sequence'] molecular_function: ['mRNA binding'] -cellular_component: ['nucleolus', 'something' ,'extracellular space'] +cellular_component: ['nucleolus', 'something' ,'extracellular space', 'endoplasmic reticulum'] biological_process: ['activation of immune response', 'defense response to other organism', 'rRNA processing'] +FlyBase anatomy CV: ['embryo','dopaminergic PAM neuron 1', 'dopaminergic PAM neuron 5', 'dissociated larval fat cell', + 'embryonic/larval hemolymph', + 'anatomy 1', 'anatomy 2', 'anatomy 3', 'mesoderm'] ####### End of order matters cv/cvterms cell_line_cvtermprop type: ['basis'] @@ -24,8 +27,8 @@ CHEBI: [] DOID: [] disease_ontology: ['hh-1'] -experimental assays: ['distribution deduced from reporter (Gal4 UAS)'] -expression slots: ['stage', 'anatomy', 'assay'] +experimental assays: ['distribution deduced from reporter (Gal4 UAS)', 'in situ'] +expression slots: ['stage', 'anatomy', 'assay', 'cellular'] feature_cvtermprop type: [ 'wt_class', 'aberr_class', 'tool_uses', 'transgene_uses property', @@ -36,9 +39,7 @@ feature_relationshipprop type: ['fly_disease-implication_change', 'comment', 're FlyBase: ['FlyBase analysis'] FlyBase_internal: ['pubprop type:curated_by'] -FlyBase anatomy CV: ['embryo','dopaminergic PAM neuron 1', 'dopaminergic PAM neuron 5', - 'embryonic/larval hemolymph', - 'anatomy 1', 'anatomy 2', 'anatomy 3'] + FlyBase development CV: ['late embryonic stage', 'embryonic stage', 'adult stage', 'development 1', 'development 2', 'development 3', 'wandering third instar larval stage'] FlyBase miscellaneous CV: [ @@ -52,6 +53,7 @@ FlyBase miscellaneous CV: [ 'pheno1', 'pheno2', 'pheno3', 'pheno4', 'pheno5', 'photoactivatable fluorescent protein', 'protein detection tool', 'project', 'qualifier', 'reagent collection', 'RNA detection tool', 'single balancer', 'spontaneous', + 'split system combination', 'transcriptome', 'umbrella project', 'unspecified'] GenBank feature qualifier: [ @@ -83,7 +85,7 @@ property type: [ 'GO_internal_notes', 'GO_review_date', 'HDM_comment', 'OMIM_pheno_table', 'PCR_template', 'aberr_relationships', 'additional_disease-implication_change', 'allele_report_comment', - 'aminoacid_rep', 'availability','balancer_status', 'bodypart_expression_marker', 'bound_moiety_comment', 'breeding_comment', 'category', 'cellular_description', + 'aminoacid_rep', 'availability','balancer_status', 'bodypart_expression_text', 'bodypart_expression_marker', 'bound_moiety_comment', 'breeding_comment', 'category', 'cellular_description', 'comment', 'complementation', 'cyto_change_comment', 'cyto_loc_comment', 'curated_phenotype', 'data_analysis', 'data_type', 'data_link', 'data_link_bdsc', 'deliberate_omission', 'deleted_segment', 'derived_cyto_location', 'description', 'diopt_ortholog', 'discoverer', 'disease_associated', 'div_comment', @@ -131,7 +133,7 @@ relationship type: [ 'identified_with', 'maps_to_clone', 'member_gene_of', 'molec_deletes', 'molec_dups', 'molec_nondeletes', 'molec_nondups', 'molec_partdeletes', 'molec_partdups', 'nomaps_to_clone', 'nondeletes', 'nonduplicates', 'originates_from', - 'overlap_inferred', 'parent_grp', 'part_deletes', 'part_duplicates', 'partof', + 'overlap_inferred', 'partially_produced_by', 'parent_grp', 'part_deletes', 'part_duplicates', 'partof', 'primer_progenitor_of', 'producedby', 'progenitor', 'replacement_descendant_of', 'representative_isoform', 'recom_right_end', 'recom_left_end', 'related_tool', 'segregant_of', 'tagged_with', 'undefined_grp'] diff --git a/triggers/multiple_seqs.sql b/triggers/multiple_seqs.sql index 406a2ee..68a3acc 100644 --- a/triggers/multiple_seqs.sql +++ b/triggers/multiple_seqs.sql @@ -16,7 +16,7 @@ $bob$ LANGUAGE plpgsql; SELECT make_fb_seqs(ARRAY['al', 'ti', 'tp', 'te', 'mc', 'ms', 'ba', 'ab', 'gn', 'tr', 'pp', 'og', - 'cl', 'gg', 'hh', 'ig', 'lc', 'rf', 'sf', 'sn', 'st', 'tc', 'to', 'ch']); + 'cl', 'gg', 'hh', 'ig', 'lc', 'rf', 'sf', 'sn', 'st', 'tc', 'to', 'ch', 'co']); CREATE OR REPLACE FUNCTION public.feature_assignname_fn_i() RETURNS trigger