-
Notifications
You must be signed in to change notification settings - Fork 0
/
rmpCrawler.java
205 lines (126 loc) · 7.16 KB
/
rmpCrawler.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
public class rmpCrawler extends WebCrawler{
StringBuffer builder = new StringBuffer();
daocrawler dao = new daocrawler();
Professor prof = new Professor();
private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
+ "|png|tiff?|mid|mp2|mp3|mp4"
+ "|wav|avi|mov|mpeg|ram|m4v|pdf"
+ "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
@Override
public boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ratemyprofessors.com/ShowRatings.jsp?tid=~");
}
public void visit(Page page){
String urlcrawler = page.getWebURL().getURL();
System.out.println(" URL to be crawled " + urlcrawler);
if(page.getParseData() instanceof HtmlParseData){
HtmlParseData htmlParseData = (HtmlParseData)page.getParseData();
String text = htmlParseData.getText();
String html = htmlParseData.getHtml();
String classname = htmlParseData.getTitle();
List<WebURL> links = htmlParseData.getOutgoingUrls();
//Removing white spaces
// text = text.replaceAll("\\s+"," ");
//Each and every Rating seperated by ~
text= text.replaceAll("Report this rating","~~~");
//Removing Headers
//int startpoint = text.indexOf();
//Removing Footers
int startpoint = text.indexOf("HomeAboutTop ListsProfessors Strike Back",0);
System.out.println(" index of :" + text.indexOf("HomeAboutTop ListsProfessors Strike Back",0));
//String contains the body of the ratings
String mainbody = text.substring(startpoint+"HomeAboutTop ListsProfessors Strike Back".length(), text.length
());
/* builder.append(mainbody.trim());
try {
BufferedWriter out = new BufferedWriter(new FileWriter("C:/data/content.txt"));
out.write(builder.toString());
out.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}*/
Professor detailsprof = parseProfessorHeaderDetails(mainbody);
parseProfessorComments(mainbody,detailsprof);
// This is code is for comments and date extraction.
/*
/* System.out.println("Text contents: " + text + "\n");
System.out.println("Html length: " + html.length());
System.out.println("Html length: " + classname.length());
System.out.println("Number of outgoing links: " + links.size());
*/
}
}
//Parsing main text body and taking only the header content
private Professor parseProfessorHeaderDetails(String headerDetails){
int countryDelimiterStartIndex = headerDetails.indexOf("»", 0);
if(headerDetails.indexOf("»", countryDelimiterStartIndex+1)!=-1){
int stateDelimiterStartIndex = headerDetails.indexOf("»", countryDelimiterStartIndex+1);
if( headerDetails.indexOf("»", stateDelimiterStartIndex+1)!=-1){
int schoolDelimiterStartIndex = headerDetails.indexOf("»", stateDelimiterStartIndex+1);
if(headerDetails.indexOf("»", schoolDelimiterStartIndex+1)!=-1){
int professorDelimiterStartIndex = headerDetails.indexOf("»", schoolDelimiterStartIndex+1);
int cityLocationDetails = headerDetails.indexOf(":", professorDelimiterStartIndex+1);
String Country = headerDetails.substring(countryDelimiterStartIndex+1 ,headerDetails.indexOf("»",
countryDelimiterStartIndex+1)).trim();
String State = headerDetails.substring(stateDelimiterStartIndex+1, headerDetails.indexOf("»",
stateDelimiterStartIndex+1)).trim();
String SchoolName = headerDetails.substring(schoolDelimiterStartIndex+1, headerDetails.indexOf("»",
schoolDelimiterStartIndex+1)).trim();
String ProfessorNameUnTrimmed = headerDetails.substring(professorDelimiterStartIndex+1,
headerDetails.indexOf(":", professorDelimiterStartIndex+1));
int citydemiliter = headerDetails.indexOf("Location:",0);
int departmentdemiliter = headerDetails.indexOf("Department:",citydemiliter);
// if(":".equals(headerDetails.charAt(citydemiliter+"Locatio".length()))){
String cityLocatedIn = headerDetails.substring(citydemiliter+"Location:".length(),
headerDetails.indexOf(",", citydemiliter+"Location:".length())).trim();
int SchoolDelimiter = ProfessorNameUnTrimmed.indexOf("School");
String ProfessorUnEditedNameDetails = ProfessorNameUnTrimmed.substring(0, SchoolDelimiter-1);
String OriginatedProfessorName = ProfessorUnEditedNameDetails.substring(0, Math.round
(ProfessorUnEditedNameDetails.length()/2)-1).trim();
String departmentName = headerDetails.substring(departmentdemiliter+"Department:".length(),
headerDetails.indexOf(".",departmentdemiliter+"Department:".length())-1).trim();
// System.out.println("Country is =-----------------> " + Country + State + SchoolName +
"OriginatedProfessorName--------->" +OriginatedProfessorName + "City Located In ---------> " +cityLocatedIn + " DEPARTMENT NAME
---------------> " + departmentName);
dao.insertProfessorDetails
(Country,State,SchoolName,cityLocatedIn,OriginatedProfessorName,departmentName);
prof.setOriginatedProfessorName(OriginatedProfessorName);
prof.setDepartmentName(departmentName);
prof.setSchoolName(SchoolName);
}}}
return prof;
}
private void parseProfessorComments(String mainbody, Professor profdetails){
int insertloopexit=0;
int mainBodyLength = mainbody.length();
String comments = mainbody.substring(mainbody.indexOf("UserCommentsandRatings",0)+"UserCommentsandRatings".length
()+"ProfessorFeedbackDateClassRatingComment".length(), mainbody.indexOf("~~~",mainbody.indexOf
("UserCommentsandRatings",0)+"ProfessorFeedbackDateClassRatingComment".length()));
if(mainbody.indexOf("~~~",0)!=-1){
String bcd = mainbody.substring(mainbody.indexOf(comments) + comments.length(), mainbody.indexOf("~~~",mainbody.indexOf(comments) +
comments.length()+4));
mainbody= mainbody.substring(mainbody.indexOf(bcd) + bcd.length(), mainBodyLength);
while(mainbody.indexOf("~~~",0)!=-1){
String second = mainbody.substring(3,mainbody.indexOf("~~~", 4));
String trimmedsecond = second.indexOf("Rater Interest",0)!= -1 ? second.substring(second.indexOf("Rater Interest",0)+"Rater
Interest".length()+3,second.length()): second.substring(second.indexOf("Clarity",0)+"Clarity".length()+3,second.length()) ;
if(trimmedsecond != null && !(trimmedsecond.indexOf("No comments")!=-1) && !(trimmedsecond.indexOf("Easiness")!=-1) && !
(trimmedsecond.indexOf("Clarity")!=-1) && !(trimmedsecond.indexOf("Rater Interest")!=-1) && !(trimmedsecond.indexOf("Helpfulness")!
=-1)){
dao.insertProfessorComments(trimmedsecond,profdetails);
}
mainbody = mainbody.substring(mainbody.indexOf(second)+ second.length(),mainbody.length());
insertloopexit++;
if(insertloopexit == 6) break;
}
}
}
}