From ce10b2db7c067b16cc3a683073a175fbf901ff4e Mon Sep 17 00:00:00 2001 From: anton083 Date: Tue, 15 Oct 2024 17:15:11 +0200 Subject: [PATCH 1/2] Add mmcifutils.jl --- src/io/io.jl | 1 + src/io/mmcifutils.jl | 47 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 src/io/mmcifutils.jl diff --git a/src/io/io.jl b/src/io/io.jl index 9e42e96..91092be 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -18,3 +18,4 @@ include("renumber.jl") include("read.jl") include("write.jl") include("download.jl") +include("mmcifutils.jl") diff --git a/src/io/mmcifutils.jl b/src/io/mmcifutils.jl new file mode 100644 index 0000000..f95f8f6 --- /dev/null +++ b/src/io/mmcifutils.jl @@ -0,0 +1,47 @@ +export getmmcif +export mapmmcif + +function map_first_occurrence(u, v) + d = Dict{eltype(u),eltype(v)}() + for (x, y) in zip(u, v) + haskey(d, x) || (d[x] = y) + end + d +end + +map_last_occurrence(u, v) = Dict(zip(u, v)) + +compose_map(d1, d2, fallback="?") = Dict(k => get(d2, v, fallback) for (k,v) in d1) + +getmmcif(mmcifdict::AbstractDict{String,Vector{String}}, key::AbstractString) = get(mmcifdict, key, String[]) + +""" + mapmmcif(mmcifdict, field1 => field2, field3 => field4, ...) + +```jldoctest +julia> import BioStructures + +julia> filename = BioStructures.downloadpdb("3HFM", format=BioStructures.MMCIFFormat); +[ Info: Downloading file from PDB: 3HFM + +julia> mmcifdict = BioStructures.MMCIFDict(filename); + +julia> mapmmcif(mmcifdict, + "_atom_site.auth_asym_id" => "_atom_site.label_entity_id", + "_entity_src_gen.entity_id" => "_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id") +Dict{String, String} with 3 entries: + "Y" => "9031" + "L" => "10090" + "H" => "10090" +``` +""" +mapmmcif(mmcifdict, pairs::Pair{String,String}...) 
= + mapreduce(((from,to),) -> map_first_occurrence(getmmcif(mmcifdict, from), getmmcif(mmcifdict, to)), compose_map, pairs) + +get_auth_asym_to_entity(mmcifdict) = mapmmcif(mmcifdict, "_atom_site.auth_asym_id" => "_atom_site.label_entity_id") + +function get_auth_asym_to_taxid(mmcifdict) + mapmmcif(mmcifdict, + "_atom_site.auth_asym_id" => "_atom_site.label_entity_id", + "_entity_src_gen.entity_id" => "_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id") +end From c61fc99239430c620360bd8ab0ee4940b3f404f5 Mon Sep 17 00:00:00 2001 From: anton083 Date: Thu, 17 Oct 2024 17:02:24 +0200 Subject: [PATCH 2/2] Add test, export BioStructures --- src/io/io.jl | 4 +++- test/runtests.jl | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/io/io.jl b/src/io/io.jl index 91092be..5f606a8 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -1,4 +1,6 @@ -using BioStructures: BioStructures, PDBFormat, MMCIFFormat +using BioStructures: BioStructures, MMCIFDict, PDBFormat, MMCIFFormat + +export BioStructures, MMCIFDict, PDBFormat, MMCIFFormat const ProteinFileFormat = Union{PDBFormat, MMCIFFormat} const AMINOACIDS = Set("ACDEFGHIKLMNPQRSTVWY") diff --git a/test/runtests.jl b/test/runtests.jl index 3a92eab..4c00c4e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -77,6 +77,14 @@ using Test @test chains[1].sequence == new_chains[1].sequence end + @testset "mmcifutils" begin + mktempdir() do dir + structure = pdbentry("3HFM"; dir) + mmcifdict = MMCIFDict(joinpath(dir, structure.name)) + @test ProteinChains.get_auth_asym_to_taxid(mmcifdict) == Dict("Y" => "9031", "L" => "10090", "H" => "10090") + end + end + end @testset "store" begin