-
Notifications
You must be signed in to change notification settings - Fork 1
/
mk_seq_profile.go
213 lines (188 loc) · 6.19 KB
/
mk_seq_profile.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
package main
import (
"flag"
"sync"
"github.com/TuftsBCB/fragbag"
"github.com/TuftsBCB/io/pdb"
"github.com/TuftsBCB/seq"
"github.com/TuftsBCB/structure"
"github.com/TuftsBCB/tools/util"
)
var cmdMkSeqProfile = &command{
name: "mk-seq-profile",
positionalUsage: "struct-frag-lib out-frag-lib " +
"pdb-chain-file [ pdb-chain-file ... ]",
shortHelp: "create a new sequence fragment library with profiles",
help: `
The mk-seq-profile command builds a sequence fragment library based on the
information from a structure fragment library and a set of PDB structures
to train on. The resulting library is a collection of fragments represented
as frequency profiles expressed as negative log odds scores. The null model
is built from the amino acid composition over all PDB chains given.
The algorithm for building a sequence fragment library is as follows:
1. Initialize an empty frequency profile for each structure fragment in the
library given.
2. For every window in every PDB chain given, find the best matching
structure fragment from the library provided.
3. Add the corresponding region of sequence to that fragment's frequency
profile. Also, update the null model for each amino acid seen.
4. After all PDB chains are processed, build a profile in terms of negative
log-odds using the null model constructed.
5. Each profile corresponds to a fragment in the resulting sequence
fragment library.
This process directly implies that the sequence fragment library produced will
have the same number of fragments and the same fragment size as the structure
fragment library given.
`,
flags: flag.NewFlagSet("mk-seq-profile", flag.ExitOnError),
run: mkSeqProfile,
addFlags: func(c *command) { c.setOverwriteFlag() },
}
var (
// There are two concurrent aspects going on here:
// 1) processing entire PDB chains
// 2) adding each part of each chain to a sequence fragment.
// So we use two waitgroups: one for synchronizing on finishing
// (1) and the other for synchronizing on finishing (2).
wgPDBChains = new(sync.WaitGroup)
wgSeqFragments = new(sync.WaitGroup)
)
func mkSeqProfile(c *command) {
c.assertLeastNArg(3)
structLib := util.StructureLibrary(c.flags.Arg(0))
outPath := c.flags.Arg(1)
entries := c.flags.Args()[2:]
util.AssertOverwritable(outPath, flagOverwrite)
saveto := util.CreateFile(outPath)
// Initialize a frequency and null profile for each structural fragment.
var freqProfiles []*seq.FrequencyProfile
var fpChans []chan seq.Sequence
for i := 0; i < structLib.Size(); i++ {
fp := seq.NewFrequencyProfile(structLib.FragmentSize())
freqProfiles = append(freqProfiles, fp)
fpChans = append(fpChans, make(chan seq.Sequence))
}
// Now spin up a goroutine for each fragment that is responsible for
// adding a sequence slice to itself.
nullChan, nullProfile := addToNull()
for i := 0; i < structLib.Size(); i++ {
addToProfile(fpChans[i], freqProfiles[i])
}
// Create a channel that sends the PDB entries given.
entryChan := make(chan string)
go func() {
for _, fp := range entries {
entryChan <- fp
}
close(entryChan)
}()
progress := util.NewProgress(len(entries))
for i := 0; i < flagCpu; i++ {
wgPDBChains.Add(1)
go func() {
for entryPath := range entryChan {
_, chains, err := util.PDBOpen(entryPath)
progress.JobDone(err)
if err != nil {
continue
}
for _, chain := range chains {
structureToSequence(structLib, chain, nullChan, fpChans)
}
}
wgPDBChains.Done()
}()
}
wgPDBChains.Wait()
progress.Close()
// We've finishing reading all the PDB inputs. Now close the channels
// and let the sequence fragments finish.
close(nullChan)
for i := 0; i < structLib.Size(); i++ {
close(fpChans[i])
}
wgSeqFragments.Wait()
// Finally, add the sequence fragments to a new sequence fragment
// library and save.
profs := make([]*seq.Profile, structLib.Size())
for i := 0; i < structLib.Size(); i++ {
profs[i] = freqProfiles[i].Profile(nullProfile)
}
lib, err := fragbag.NewSequenceProfile(structLib.Name(), profs)
util.Assert(err)
util.Assert(fragbag.Save(saveto, lib))
}
// structureToSequence uses structural fragments to categorize a segment
// of alpha-carbon atoms, and adds the corresponding residues to a
// corresponding sequence fragment.
func structureToSequence(
lib fragbag.StructureLibrary,
chain *pdb.Chain,
nullChan chan seq.Sequence,
seqChans []chan seq.Sequence,
) {
sequence := chain.AsSequence()
fragSize := lib.FragmentSize()
// If the chain is shorter than the fragment size, we can do nothing
// with it.
if sequence.Len() < fragSize {
util.Verbosef("Sequence '%s' is too short (length: %d)",
sequence.Name, sequence.Len())
return
}
// If we're accumulating a null model, add this sequence to it.
if nullChan != nil {
nullChan <- sequence
}
// This bit of trickery here is all about getting the call to
// SequenceCaAtoms outside of the loop. In particular, it's a very
// expensive call since it has to reconcile inconsistencies between
// SEQRES and ATOM records in PDB files.
limit := sequence.Len() - fragSize
atoms := chain.SequenceCaAtoms()
atomSlice := make([]structure.Coords, fragSize)
noGaps := func(atoms []*structure.Coords) []structure.Coords {
for i, atom := range atoms {
if atom == nil {
return nil
}
atomSlice[i] = *atom
}
return atomSlice
}
for start := 0; start <= limit; start++ {
end := start + fragSize
cas := noGaps(atoms[start:end])
if cas == nil {
// Nothing contiguous was found (a "disordered" residue perhaps).
// So skip this part of the chain.
continue
}
bestFrag := lib.BestStructureFragment(atomSlice)
sliced := sequence.Slice(start, end)
seqChans[bestFrag] <- sliced
}
}
func addToProfile(sequences chan seq.Sequence, fp *seq.FrequencyProfile) {
wgSeqFragments.Add(1)
go func() {
for s := range sequences {
fp.Add(s)
}
wgSeqFragments.Done()
}()
}
func addToNull() (chan seq.Sequence, *seq.FrequencyProfile) {
nullChan := make(chan seq.Sequence, 100)
nullProfile := seq.NewNullProfile()
wgSeqFragments.Add(1)
go func() {
for s := range nullChan {
for i := 0; i < s.Len(); i++ {
nullProfile.Add(s.Slice(i, i+1))
}
}
wgSeqFragments.Done()
}()
return nullChan, nullProfile
}