-
Notifications
You must be signed in to change notification settings - Fork 3
/
scrape.js
67 lines (59 loc) · 2.15 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
'use strict';
// Matches the first anchor in a cell and captures its link text. The closing
// tag is optional because the source markup frequently omits it.
var LINK_TITLE_RE = /<A[^>]*>([^<]*)(<\/A>)?/;
// Captures the single-quoted HREF value of the first anchor in a cell.
var LINK_HREF_RE = /<A.*?HREF='([^']*)'/;
// Captures the value of the `state=` query parameter (last one on the line).
var STATE_PARAM_RE = /.*state=(\w+)/;
// Captures the page name of a fonefinder.net carrier link, e.g. "att" from
// "http://fonefinder.net/att.php". Dots are escaped so they match literally.
var CARRIER_SLUG_RE = /.*fonefinder\.net\/(.*)\.php/;
// Number of cells a row needs for every field read below to be defined.
var MIN_COLS = 6;

/**
 * Returns the first capture group of `re` within `text`, or '' when `re`
 * does not match. `re` must not carry the `g`/`y` flag.
 */
function firstCapture(re, text) {
  var match = re.exec(text);
  return match ? match[1] : '';
}

/**
 * Replaces the first anchor in `cell` with its link text, leaving any
 * surrounding text untouched. Cells without an anchor are returned as-is.
 */
function anchorText(cell) {
  return cell.replace(LINK_TITLE_RE, '$1');
}

/**
 * Converts one `<TR>` fragment into a result record, or returns null when
 * the fragment contains no `<TD>` cells at all.
 */
function parseRow(row) {
  var cols = row.split(/<TD>/ig);
  var carrierLink;

  cols.shift(); // whatever precedes the first <TD>
  if (cols.length === 0) {
    return null;
  }
  // Short rows are padded so that missing cells read as empty strings
  // instead of raising a TypeError.
  while (cols.length < MIN_COLS) {
    cols.push('');
  }

  // Example row (cells separated by <TD>):
  // <A HREF='findareacode.php?areacode=972'>972</A>
  // <A HREF='findome.php?npa=&nxx=360&usaquerytype=Search+by+Number'>360
  // <A HREF='findcity.php?cityname=DALLAS&state=TX'>DALLAS
  // <A HREF='shownpamap.php?graphic=http://fonefinder.net/images/npamap/tx.gif'>Texas</A>
  // <A HREF='http://fonefinder.net/att.php'>AT&T LOCAL</A>
  // CLEC
  // <A HREF='findome.php?npa=972&nxx=360&usaquerytype=Search+by+Number'>More</A>

  carrierLink = firstCapture(LINK_HREF_RE, cols[4]);
  // Only absolute links are kept; the bare site root is an incomplete link.
  if (!/^http/.test(carrierLink) || carrierLink === 'http://fonefinder.net') {
    carrierLink = '';
  }

  return [
    anchorText(cols[0]),                        // 0: area
    anchorText(cols[1]),                        // 1: prefix
    anchorText(cols[2]),                        // 2: city
    anchorText(cols[3]),                        // 3: state
    firstCapture(STATE_PARAM_RE, cols[2]),      // 4: st, taken from the city link
    anchorText(cols[4]),                        // 5: company
    cols[5],                                    // 6: type
    carrierLink,                                // 7: carrierLink
    firstCapture(CARRIER_SLUG_RE, carrierLink), // 8: carrier ('' when no slug)
    ''                                          // 9: carrierName2 (placeholder)
  ];
}

/**
 * Extracts the rows of the results table from a lookup page.
 *
 * @param {string} html - Raw page markup.
 * @returns {?Array<Array<string>>} null when the page contains
 *   "Sorry, no records found"; otherwise one array per data row in the order
 *   [area, prefix, city, state, st, company, type, carrierLink, carrier,
 *   carrierName2]. Fields that cannot be extracted are empty strings.
 */
function scrape(html) {
  var records = [];
  var rows;

  if (/Sorry, no records found/.test(html)) {
    return null;
  }

  // there's tons of malformed html, let's level the playing field:
  // keep only the table, then collapse each header run into a bare <TR>.
  html = html.replace(/[\s\S]+(<TABLE[\s\S]+?TABLE>)[\s\S]+/g, '$1');
  html = html.replace(/<TH[\s\S]+?<TR>/g, '<TR>');

  rows = html.split(/<TR>/ig);
  rows.shift(); // TABLE open + header
  // NOTE(review): the final fragment is dropped on the assumption that it
  // holds only the closing TABLE markup; if the last data row is not followed
  // by another <TR> it is discarded too — confirm against the live markup.
  rows.pop(); // TABLE close

  rows.forEach(function (row) {
    var record = parseRow(row);
    if (record !== null) {
      records.push(record);
    }
  });

  return records;
}
// Expose `scrape` as a named property of this module's CommonJS exports.
module.exports.scrape = scrape;