-
Notifications
You must be signed in to change notification settings - Fork 0
/
FrequentWordwithMis.py
109 lines (99 loc) · 2.28 KB
/
FrequentWordwithMis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/python
import sys
input = sys.stdin.read()
tokens = input.split()
sequence=tokens[0]
k=int(tokens[1])
d=int(tokens[2])
#file=open('test.txt')
#sequence=file.read()
#sequence=input('Sequence: ')
#k, d=input('Enter k, d here: ').split()
#k=int(k)
#d=int(d)
#find all the patterns in the sequence
pattern_all=[]
unique=set()
length=len(sequence)
for i in range(0,length-k+1):
pattern=sequence[i:i+k]
pattern_all.append(pattern)
for pattern in pattern_all:
unique.add(pattern)
# create all Kmer array
array={}
dict={0:'A',1:'C', 2:'G',3:'T'}
for i in range(0, 4**k):
count=0
I=i
K=k
list=['A']*k
while K-1>=0:
if I >= 4**(K-1)*3:
list[count]='T'
I-=4**(K-1)*3
elif 4**(K-1)*2<=I<4**(K-1)*3:
list[count]='G'
I-=4**(K-1)*2
elif 4**(K-1)<=I<4**(K-1)*2:
list[count]='C'
I-=4**(K-1)
elif 0<=I<4:
list[-1]=dict[I]
count+=1
K-=1
array[''.join(list)]=i
# find d-neighbor of k-mers in the sequence
kmers=array.keys()
neighbor=set()
for pattern in unique:
for kmer in kmers:
dis=0
for i in range(0,k):
if pattern[i]!= kmer[i]:
dis+=1
if dis <=d:
neighbor.add(kmer)
# create the reverse sequence of each pattern in neighbor
dict={'A':'T', 'T':'A', 'G':'C', 'C':'G'}
pair={}
for pattern in neighbor:
pattern_r=[]
for i in range(k-1,-1,-1):
b=pattern[i]
c=dict[b]
pattern_r.append(c)
pair[pattern]=''.join(pattern_r)
pair[''.join(pattern_r)]=pattern
neighbor_r=set(pair.values())
for i in neighbor_r:
neighbor.add(i)
# find the frequence of each pattern in neighbor + frequence of reverse pattern
count_all={}
for pattern in neighbor:
count=0
count_r=0
for item in pattern_all:
dis=0
for i in range(0,k):
if pattern[i]!= item[i]:
dis+=1
if dis<=d:
count+=1
for item in pattern_all:
dis=0
for i in range(0,k):
if pair[pattern][i]!= item[i]:
dis+=1
if dis<=d:
count_r+=1
count_all[pattern]=count+count_r
if not count_all:
count_max='none'
else:
count_max=max(count_all.values())
pattern_max=[]
for pattern in neighbor:
if count_all[pattern]==count_max:
pattern_max.append(pattern)
print(' '.join(pattern_max))