Skip to content

Commit

Permalink
Merge pull request #14 from MurrellGroup/mmcifutils
Browse files Browse the repository at this point in the history
MMCIF utilities
  • Loading branch information
AntonOresten authored Oct 17, 2024
2 parents 8ba98fc + 1ca4a19 commit a2f128a
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/io/io.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using BioStructures: BioStructures, PDBFormat, MMCIFFormat
using BioStructures: BioStructures, MMCIFDict, PDBFormat, MMCIFFormat

export BioStructures, MMCIFDict, PDBFormat, MMCIFFormat

const ProteinFileFormat = Union{PDBFormat, MMCIFFormat}
const AMINOACIDS = Set("ACDEFGHIKLMNPQRSTVWY")
Expand All @@ -18,3 +20,4 @@ include("renumber.jl")
include("read.jl")
include("write.jl")
include("download.jl")
include("mmcifutils.jl")
47 changes: 47 additions & 0 deletions src/io/mmcifutils.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
export getmmcif
export mapmmcif

function map_first_occurrence(u, v)
d = Dict{eltype(u),eltype(v)}()
for (x, y) in zip(u, v)
haskey(d, x) || (d[x] = y)
end
d
end

map_last_occurrence(u, v) = Dict(zip(u, v))

compose_map(d1, d2, fallback="?") = Dict(k => get(d2, v, fallback) for (k,v) in d1)

getmmcif(mmcifdict::AbstractDict{String,Vector{String}}, key::AbstractString) = get(mmcifdict, key, String[])

"""
mapmmcif(mmcifdict, field1 => field2, field3 => field4, ...)
```jldoctest
julia> import BioStructures
julia> filename BioStructures.downloadpdb("3HFM", format=BioStructures.MMCIFFormat);
[ Info: Downloading file from PDB: 3HFM
julia> mmcifdict = BioStructures.MMCIFDict(filename);
julia> mapmmcif(mmcifdict,
"_atom_site.auth_asym_id" => "_atom_site.label_entity_id",
"_entity_src_gen.entity_id" => "_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id")
Dict{String, String} with 3 entries:
"Y" => "9031"
"L" => "10090"
"H" => "10090"
```
"""
mapmmcif(mmcifdict, pairs::Pair{String,String}...) =
mapreduce(((from,to),) -> map_first_occurrence(getmmcif(mmcifdict, from), getmmcif(mmcifdict, to)), compose_map, pairs)

get_auth_asym_to_entity(mmcifdict) = mapmmcif(mmcifdict, "_atom_site.auth_asym_id" => "_atom_site.label_entity_id")

function get_auth_asym_to_taxid(mmcifdict)
mapmmcif(mmcifdict,
"_atom_site.auth_asym_id" => "_atom_site.label_entity_id",
"_entity_src_gen.entity_id" => "_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id")
end
8 changes: 8 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ using Test
@test chains[1].sequence == new_chains[1].sequence
end

@testset "mmcifutils" begin
mktempdir() do dir
structure = pdbentry("3HFM"; dir)
mmcifdict = MMCIFDict(joinpath(dir, structure.name))
@test ProteinChains.get_auth_asym_to_taxid(mmcifdict) == Dict("Y" => "9031", "L" => "10090", "H" => "10090")
end
end

# See https://proteopedia.org/wiki/index.php/Unusual_sequence_numbering
@testset "Unusual numbering" begin

Expand Down

0 comments on commit a2f128a

Please sign in to comment.