diff --git a/src/io/io.jl b/src/io/io.jl index 9e42e96..5f606a8 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -1,4 +1,6 @@ -using BioStructures: BioStructures, PDBFormat, MMCIFFormat +using BioStructures: BioStructures, MMCIFDict, PDBFormat, MMCIFFormat + +export BioStructures, MMCIFDict, PDBFormat, MMCIFFormat const ProteinFileFormat = Union{PDBFormat, MMCIFFormat} const AMINOACIDS = Set("ACDEFGHIKLMNPQRSTVWY") @@ -18,3 +20,4 @@ include("renumber.jl") include("read.jl") include("write.jl") include("download.jl") +include("mmcifutils.jl") diff --git a/src/io/mmcifutils.jl b/src/io/mmcifutils.jl new file mode 100644 index 0000000..f95f8f6 --- /dev/null +++ b/src/io/mmcifutils.jl @@ -0,0 +1,47 @@ +export getmmcif +export mapmmcif + +function map_first_occurrence(u, v) + d = Dict{eltype(u),eltype(v)}() + for (x, y) in zip(u, v) + haskey(d, x) || (d[x] = y) + end + d +end + +map_last_occurrence(u, v) = Dict(zip(u, v)) + +compose_map(d1, d2, fallback="?") = Dict(k => get(d2, v, fallback) for (k,v) in d1) + +getmmcif(mmcifdict::AbstractDict{String,Vector{String}}, key::AbstractString) = get(mmcifdict, key, String[]) + +""" + mapmmcif(mmcifdict, field1 => field2, field3 => field4, ...) + +```jldoctest +julia> import BioStructures + +julia> filename BioStructures.downloadpdb("3HFM", format=BioStructures.MMCIFFormat); +[ Info: Downloading file from PDB: 3HFM + +julia> mmcifdict = BioStructures.MMCIFDict(filename); + +julia> mapmmcif(mmcifdict, + "_atom_site.auth_asym_id" => "_atom_site.label_entity_id", + "_entity_src_gen.entity_id" => "_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id") +Dict{String, String} with 3 entries: + "Y" => "9031" + "L" => "10090" + "H" => "10090" +``` +""" +mapmmcif(mmcifdict, pairs::Pair{String,String}...) = + mapreduce(((from,to),) -> map_first_occurrence(getmmcif(mmcifdict, from), getmmcif(mmcifdict, to)), compose_map, pairs) + +get_auth_asym_to_entity(mmcifdict) = mapmmcif(mmcifdict, "_atom_site.auth_asym_id" => "_atom_site.label_entity_id") + +function get_auth_asym_to_taxid(mmcifdict) + mapmmcif(mmcifdict, + "_atom_site.auth_asym_id" => "_atom_site.label_entity_id", + "_entity_src_gen.entity_id" => "_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id") +end diff --git a/test/runtests.jl b/test/runtests.jl index c387142..51c1561 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -81,6 +81,14 @@ using Test @test chains[1].sequence == new_chains[1].sequence end + @testset "mmcifutils" begin + mktempdir() do dir + structure = pdbentry("3HFM"; dir) + mmcifdict = MMCIFDict(joinpath(dir, structure.name)) + @test ProteinChains.get_auth_asym_to_taxid(mmcifdict) == Dict("Y" => "9031", "L" => "10090", "H" => "10090") + end + end + # See https://proteopedia.org/wiki/index.php/Unusual_sequence_numbering @testset "Unusual numbering" begin