-
Notifications
You must be signed in to change notification settings - Fork 0
/
build-pdb-hhm-db
executable file
·130 lines (111 loc) · 2.66 KB
/
build-pdb-hhm-db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/sh
set -e
function num_cpus {
if [ -r /proc/cpuinfo ]; then
NPROC=$(cat /proc/cpuinfo | grep '^processor' | wc -l | tr -d ' ')
else
NPROC=1
fi
echo $NPROC
}
function msg {
echo -e $* >&2
}
function usage {
msg "Usage: `basename $0` [flags] new-db-name hhsuite-db pdb-dir"
msg
msg "--help"
msg "\tShow this help message."
msg "--cpu num-cpus"
msg "\tThe number of CPUs for HHsuite to use."
exit 1
}
num_cpus=`num_cpus`
while true; do
case "$1" in
-cpu|--cpu)
num_cpus=$2
shift 2
;;
-h|-help|--help)
usage
;;
-*|--*)
msg "Invalid flag $1."
msg
usage
;;
*)
break
;;
esac
done
if [ $# != 3 ]; then
usage
fi
pdb_hhm_db="$1" # The new DB we're creating.
hhsuite_db="$2" # The DB we're using to generate MSAs.
pdb_dir="$3" # The directory containing the PDB files.
transient="$pdb_dir/transient" # A temp dir to store fasta/a3m/hhm files.
if [ -d "$pdb_hhm_db" ]; then
msg "$pdb_hhm_db already exists. Quitting..."
exit 1
fi
mkdir -p $pdb_hhm_db/pdb
rm -rf "$transient"
mkdir "$transient"
msg "Generating a FASTA file for each PDB file..."
for f in "$pdb_dir"/*.pdb "$pdb_dir"/*.ent "$pdb_dir"/*.ent.gz; do
if [ ! -f "$f" ]; then
continue
fi
basef=$(basename "$f")
name=""
if [[ "$f" = *.pdb ]]; then
name="${basef%*.pdb}"
elif [[ "$f" = *.ent ]]; then
name="${basef%*.ent}"
elif [[ "$f" = *.ent.gz ]]; then
name="${basef%*.ent.gz}"
else
msg "Unknown PDB extension: $f"
exit 1
fi
pdb2fasta --chain ${name:4} "$f" "$transient/$name.fasta" 2>&1
done
msg "Copying PDB files..."
for f in "$pdb_dir"/*.{pdb,ent,ent.gz}; do
if [ ! -f "$f" ]; then
continue
fi
cp -a "$f" "$pdb_hhm_db"/pdb/
done
msg "Building a multiple-sequence alignment for each sequence."
msg "This may take a while. Go grab a drink."
glob="$transient"/'*.fasta'
multithread.pl \
"$glob" \
'hhblits -i $file -d '"$hhsuite_db"' -n 2 -mact 0.35 -oa3m $name.a3m' \
--cpu $num_cpus 2>&1
msg "Performing secondary structure annotation on each MSA."
glob="$transient"/'*.a3m'
multithread.pl \
"$glob" \
'addss.pl $file' \
--cpu $num_cpus 2>&1
msg "Cleaning up messes made by hhsuite's 'addss.pl' script."
for f in "$transient"/*.a3m; do
clean-a3m "$f"
done
msg "Generating an HMM for each MSA."
glob="$transient"/'*.a3m'
multithread.pl \
"$glob" \
'hhmake -i $file -pcm 4 -pca 2.5 -pcb 0.5 -pcc 1.0 -gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6' \
--cpu $num_cpus 2>&1
msg "Building the HHblits database."
hhblitsdb.pl \
-o "$pdb_hhm_db/$(basename "$pdb_hhm_db")" \
-ia3m "$transient" \
-ihhm "$transient" \
-cpu $num_cpus