nlptoarchfinal.py
# -*- coding: utf-8 -*-
"""NLPtoArchFinal.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Az7ps0L9EoLeAJF3sbGEQdi3UK0ULmvd
"""
import nltk
import re

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

# Keep connectives that carry structural meaning for the function model.
stopwords_remove = ['from', 'to', 'and', 'it', 'with']
for i in stopwords_remove:
    stop_words.remove(i)

# Tokens are matched case-sensitively below, so the sentence-initial "The"
# must be added explicitly.
stopwords_add = ['The']
for i in stopwords_add:
    stop_words.add(i)

# stop_words.remove('of')

from nltk.stem.snowball import SnowballStemmer
ps = SnowballStemmer("english")

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
input_paragraph = "The coffeemaker consists of a brewer and a pot. The pot consists of a glassware, a lid, and a handle. The brewer is composed of a tank, a heating unit, a brewing unit, and a dispensing unit. The heating unit consists of a heating coil, a hot water pipe, and a water valve. The heating coil imports electricity and converts it into heat. The heating coil transfers heat to the water pipe. The water pipe imports water and heat and energizes water with heat. The vertical pipe transfers hot water from the water pipe to the shower head. The brewing unit consists of a shower head, a filter, and a filter holder. The shower head imports hot water from the vertical pipe and distributes it. The filter imports ground coffee and hot water from the shower head, couples these two flows,and exports liquid coffee. The filter separates liquid coffee from the ground coffee and exports it to the glassware."
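# Flow words: tokens matching this list are treated as energy/material flows
# and kept verbatim (they skip the WordNet synonym lookup further below).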
energy_list=['heat','electricity','water','steam','kinetic','chemical','electrical','mechanical','thermal','nuclear','gravitational', 'hot']
def remove_stopwords_from_raw_text(text):
    """Tokenize raw text and drop the stopwords defined above."""
    try:
        words = nltk.word_tokenize(text)
        return remove_stopwords_from_tokenized_text(words)
    except Exception as e:
        print(str(e))

def remove_stopwords_from_tokenized_text(words):
    """Return the tokens that are not in stop_words (case-sensitive)."""
    try:
        output_list = []
        for w in words:
            if w not in stop_words:
                output_list.append(w)
        return output_list
    except Exception as e:
        print(str(e))

def tag_pos_of_raw_text(text):
    """Tokenize raw text and tag each token with its part of speech."""
    try:
        words = nltk.word_tokenize(text)
        return nltk.pos_tag(words)
    except Exception as e:
        print(str(e))
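# A minimal usage sketch of the helpers above (illustrative; the tag output
# assumes NLTK's default Punkt tokenizer and perceptron tagger):
#   remove_stopwords_from_raw_text("The pot consists of a lid.")
#   -> ['pot', 'consists', 'lid', '.']
#   tag_pos_of_raw_text("The pot consists of a lid.")
#   -> [('The', 'DT'), ('pot', 'NN'), ('consists', 'VBZ'), ...]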
# Remove stopwords.
final_list = remove_stopwords_from_raw_text(input_paragraph)
print("\n\n\n\nAfter removing stopwords = ", final_list)
# Converting words not in nltk synonym list to the ones that are accepted
index=0
for i in final_list:
if ((i.lower()=="supply".lower())):
final_list[index]="transfer"
if i.lower()=="incorporate".lower() or i.lower()=="composed".lower() :
final_list[index]="consists"
if i.lower()=="accomodates".lower() :
final_list[index]="contains"
if i.lower()=="tranform".lower() :
final_list[index]="converts"
index=index+1
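# For example, "The brewer is composed of a tank ..." now reads
# "brewer consists tank , heating unit , brewing unit , and dispensing unit".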
new_final_list = []

# Re-join the filtered tokens into a single string, then split it back into
# sentences on ".".
new_para = ""
for ele in final_list:
    new_para = new_para + " " + ele

new_para_latest = ""
carry_on = ""
new_list = new_para.split(".")
for i in new_list:
    if "consists" in i:
        # "X consists A , B , and C": collapse each multi-word name into one
        # UPPER_SNAKE token, keeping "consists", "and" and "," as separators.
        carry_on = ""
        within_words = i.split()
        for words in within_words:
            if words == "and" or words == "consists" or words == ",":
                new_para_latest = new_para_latest[:-1] + " " + words + " "
            else:
                new_para_latest = new_para_latest + words.upper() + "_"
                carry_on = words
        new_para_latest = new_para_latest[:-1] + " . "
    else:
        # Sentences with none of these trigger verbs are dropped here.
        if ("imports" in i) or ("couples" in i) or ("transfers" in i):
            within_words = i.split()
            index_func = 0
            index_from = 0
            index_to = 0
            # Locate the function verb and the "from"/"to" markers.
            for m in range(len(within_words)):
                if within_words[m] in ("imports", "transfers", "couples"):
                    index_func = m
                elif within_words[m] == "from":
                    index_from = m
                elif within_words[m] == "to":
                    index_to = m
                else:
                    continue
            m = 0
            while m < len(within_words):
                if index_func > 0 and m == 0:
                    # Collapse the multi-word subject before the verb.
                    for x in range(index_func):
                        new_para_latest = new_para_latest + within_words[m].upper() + "_"
                        m = m + 1
                    new_para_latest = new_para_latest[:-1]
                elif index_from > 0 and index_to > 0 and m == index_from:
                    # "from X to Y": collapse X and Y separately.
                    new_para_latest = new_para_latest + " from "
                    m = m + 1
                    for x in range(index_from + 1, index_to):
                        new_para_latest = new_para_latest + within_words[m].upper() + "_"
                        m = m + 1
                    new_para_latest = new_para_latest[:-1] + " to "
                    m = m + 1
                    for x in range(index_to + 1, len(within_words)):
                        new_para_latest = new_para_latest + within_words[m].upper() + "_"
                        m = m + 1
                    new_para_latest = new_para_latest[:-1]
                elif index_from > 0 and index_to == 0 and m == index_from:
                    # "from X" with no "to". (The original condition compared
                    # the sentence string i to the integer index_from, so this
                    # branch could never fire; restored to the intended case.)
                    new_para_latest = new_para_latest + " from "
                    m = m + 1
                    for x in range(index_from + 1, len(within_words)):
                        new_para_latest = new_para_latest + within_words[m].upper() + "_"
                        m = m + 1
                    new_para_latest = new_para_latest[:-1]
                elif index_from == 0 and index_to > 0 and m == index_to:
                    # "to Y" with no "from".
                    new_para_latest = new_para_latest + " to "
                    m = m + 1
                    for x in range(index_to + 1, len(within_words)):
                        new_para_latest = new_para_latest + within_words[m].upper() + "_"
                        m = m + 1
                    new_para_latest = new_para_latest[:-1]
                else:
                    new_para_latest = new_para_latest + " " + within_words[m]
                    m = m + 1
            new_para_latest = new_para_latest + " . "
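# For the sample paragraph this yields sentences such as
#   "COFFEEMAKER consists BREWER and POT ."
#   "VERTICAL_PIPE transfers hot water from WATER_PIPE to SHOWER_HEAD ."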
print("\n\n Latest para = ", new_para_latest)
def convert_synonyms(word):
    """Map `word` onto an accepted term if any of its WordNet lemmas match."""
    try:
        stemmed_word = ps.stem(word)
        if word == '.' or word == ',':
            return word
        # The original wrapped the stem in literal quotes before the lookup
        # ('"' + stemmed_word + '"'), which makes wordnet.synsets() return
        # nothing; the bare stem is looked up here instead.
        for syn in wordnet.synsets(stemmed_word):
            for lm in syn.lemmas():
                for x in word_accept:
                    if lm.name().lower() == x.lower():
                        return x
        return word
    except Exception:
        # The original except branch returned None implicitly; the caller's
        # None check below is kept for safety.
        return word
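# Example (assuming the sketched word_accept above): ps.stem("imports") is
# "import", whose WordNet lemmas include "import", so
# convert_synonyms("imports") -> "import".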
# Replace each token with its accepted synonym where WordNet finds one;
# energy/material-flow words are kept verbatim.
for item in word_tokenize(new_para_latest):
    var = False
    for i in energy_list:
        if i.lower() == item.lower():
            new_final_list.append(i)
            var = True
            break
    if var:
        continue
    new_item = convert_synonyms(item)
    if new_item is None:
        new_final_list.append(item)
    else:
        new_final_list.append(new_item)
# Print the processed list and join it back into a single string.
print("\n\n\n List after processing synonyms = ", new_final_list)
# Renamed from `str`, which shadowed the builtin.
output_text = ''
for element in new_final_list:
    output_text = output_text + " " + element
print(output_text)
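# Expected shape of the final output (illustrative): the normalised paragraph,
# e.g. "COFFEEMAKER consists BREWER and POT . POT consists GLASSWARE , LID ,
# and HANDLE . ..."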
# textfile = open("NLP_Output.txt", "w")
# for element in final_list:
# textfile.write(element + "\n")
# textfile.close()
# from google.colab import files
# files.download('NLP_Output.txt')