-
Notifications
You must be signed in to change notification settings - Fork 0
/
url-scraper.js
144 lines (115 loc) · 4.18 KB
/
url-scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
// ---------------------------------------------------------------------------
// Shared script state.
// NOTE: these are deliberately `var` declarations so that evaluating this
// script more than once in the same scope does not throw on redeclaration.
// ---------------------------------------------------------------------------

// jQuery collection of every <img> element currently visible on the page.
var $imageTags = $('img:visible');

// Bearer token sent to the tagging API by query_api().
// NOTE(review): this credential is hard-coded in source; consider loading it
// from configuration instead of shipping it with the script.
var authCode = "umUFgMhzz2cMpj6TexWSdmgzO6FhcY";

// Image URLs that passed the size filter (filled in by getResult()).
var sources = [];

// Lower-cased tag -> list of image URLs carrying that tag (filled in by
// makeHashmap()).
var results = {};

// JSON string form of `results`; either read from the cache below or
// produced by finish().
var result;

// Reuse the cached tag map when one exists, otherwise scan the page now.
if (localStorage.imageTag == undefined) {
  getResult(finish);
} else {
  result = localStorage.imageTag;
}
/**
 * Scans `$imageTags`, records the `src` of every image whose rendered size
 * is strictly between the size limits, submits each recorded URL to the
 * tagging and OCR services, and finally signals completion.
 *
 * Relies on `query_api` and `make_ocr_request` completing before they return
 * (both issue their requests with `async: false`); the completion callback
 * runs immediately after the last request call.
 *
 * @param {Function} [callback] - Called with no arguments once every image
 *     has been submitted. Defaults to `finish` when omitted or not a function.
 */
function getResult(callback) {
  // Rendered width/height must be strictly inside (MIN, MAX) pixels.
  var LIMITING_SIZE_MIN = 40;
  var LIMITING_SIZE_MAX = 3200;
  var done = typeof callback === "function" ? callback : finish;
  var i;

  for (i = 0; i < $imageTags.length; i++) {
    var image = $imageTags[i];
    // Declared locally: the previous undeclared assignments leaked globals
    // and throw a ReferenceError in strict-mode code.
    var width = image.clientWidth;
    var height = image.clientHeight;
    var src = image.src;

    var sizeOk = typeof width !== "undefined" &&
      width > LIMITING_SIZE_MIN && height > LIMITING_SIZE_MIN &&
      width < LIMITING_SIZE_MAX && height < LIMITING_SIZE_MAX;

    // Skip images without a URL and URLs already queued: every queued entry
    // costs two blocking HTTP requests below.
    if (sizeOk && src && sources.indexOf(src) === -1) {
      sources.push(src);
    }
  }

  for (i = 0; i < sources.length; i++) {
    query_api(sources[i], makeHashmap);
    make_ocr_request(sources[i], makeHashmap);
  }

  done();
}
/**
 * Records `url` under every tag in `hash` inside the shared `results` map.
 * Tags are lower-cased before use as keys, and a URL is stored at most once
 * per tag.
 *
 * @param {string} url - Image URL the tags belong to.
 * @param {string[]} hash - Tags (or OCR words) found for that image. Entries
 *     that are not non-empty strings are ignored; a missing list is a no-op.
 */
function makeHashmap(url, hash) {
  if (!hash || typeof hash.length !== "number") {
    return;
  }

  for (var i = 0; i < hash.length; i++) {
    var tag = hash[i];
    if (typeof tag !== "string" || tag === "") {
      continue;
    }
    var key = tag.toLowerCase();

    // Only own properties count: a tag such as "constructor" would otherwise
    // resolve to an inherited Object.prototype member and break `.indexOf`.
    var bucket = Object.prototype.hasOwnProperty.call(results, key) ? results[key] : null;
    if (!bucket) {
      bucket = [];
      // defineProperty (rather than assignment) so that even "__proto__"
      // becomes an ordinary own key instead of replacing the prototype.
      Object.defineProperty(results, key, {
        value: bucket,
        enumerable: true,
        writable: true,
        configurable: true
      });
    }

    if (bucket.indexOf(url) === -1) {
      bucket.push(url);
    }
  }

  console.log("resulting hashmap contains:");
  console.log (results);
}
/**
 * Finalises the scan: when no result string exists yet, serialises the
 * shared `results` map into `result` and caches it in
 * `localStorage.imageTag`. An already-present `result` is left untouched.
 * The final value is logged either way.
 */
function finish() {
  if (result == undefined) {
    result = JSON.stringify(results);
    try {
      localStorage.imageTag = result;
    } catch (err) {
      // Storage can be full or disabled; caching is only an optimisation,
      // so keep the freshly computed result instead of aborting the script.
      console.log("could not cache result in localStorage: " + err);
    }
  }
  console.log("finish");
  console.log(result);
}
/**
 * Requests tags for one image from the Clarifai tagging endpoint and passes
 * them to `callback`.
 *
 * The request is issued with `async: false`, so this function returns only
 * after the success or error handler has run.
 *
 * @param {string} url - Absolute URL of the image to tag.
 * @param {Function} callback - Called as `callback(url, classes)` with the
 *     tag list taken from `data.results[0].result.tag.classes`. Not called
 *     when the request fails or the response lacks that list.
 */
function query_api(url, callback) {
  $.ajax({
    // The image URL is a query-string value and must be percent-encoded,
    // otherwise any `&`, `#` or `?` inside it corrupts the request.
    url: "https://api.clarifai.com/v1/tag/?url=" + encodeURIComponent(url),
    'headers': {
      'Authorization': 'Bearer ' + authCode
    },
    type: "GET",
    async: false,
    success: function (data) {
      var first = data && data.results && data.results[0];
      var tag = first && first.result && first.result.tag;
      var classes = tag && tag.classes;

      // A throw here would propagate out of the synchronous request and
      // abort the caller, so an unexpected payload is logged and skipped.
      if (!Array.isArray(classes)) {
        console.log("AJAX error: unexpected tag response for " + url);
        return;
      }
      callback(url, classes);
    },
    error: function (jqXHR, textStatus, errorThrown) {
      console.log("AJAX error: " + textStatus + " " + errorThrown);
    }
  });
}
/**
 * Sends one image URL to the Project Oxford OCR endpoint and passes every
 * recognised word to `callback`.
 *
 * The request is issued with `async: false`, so this function returns only
 * after the done/fail handler has run.
 *
 * @param {string} url - Absolute URL of the image to run OCR on.
 * @param {Function} callback - Called as `callback(url, words)` where `words`
 *     is the list of non-empty `text` values from every word of every line of
 *     every region in the response. Not called when no word was recognised
 *     or when the request fails.
 */
function make_ocr_request(url, callback) {
  $.ajax({
    url: "https://api.projectoxford.ai/vision/v1.0/ocr?" + "language=unk&detectOrientation=true",
    beforeSend: function (xhrObj) {
      // Request headers
      xhrObj.setRequestHeader("Content-Type", "application/json");
      // NOTE(review): subscription key is hard-coded in source; consider
      // loading it from configuration.
      xhrObj.setRequestHeader("Ocp-Apim-Subscription-Key", "c93522f717264b48924915779428dc8c");
    },
    type: "POST",
    async: false,
    // Request body: serialised with JSON.stringify so the payload is valid
    // JSON even when the URL contains quotes or backslashes.
    data: JSON.stringify({ Url: url })
  })
    .done(function (data) {
      console.log("the data returned from the ocr reques was");
      console.log(data);

      var regions = data && Array.isArray(data.regions) ? data.regions : [];
      var words = [];

      // Walk every region, not just the first, so no recognised text is lost.
      for (var r = 0; r < regions.length; r++) {
        var lines = (regions[r] && regions[r].lines) || [];
        for (var l = 0; l < lines.length; l++) {
          var lineWords = (lines[l] && lines[l].words) || [];
          for (var w = 0; w < lineWords.length; w++) {
            var text = lineWords[w] && lineWords[w].text;
            if (typeof text === "string" && text !== "") {
              words.push(text);
            }
          }
        }
      }

      if (words.length === 0) {
        return;
      }

      console.log("parsed JSON : " + words.join(" "));
      console.log("arr to push to hashmap : " + words);
      console.log("HTTP request for OCR worked");
      callback(url, words);
    })
    .fail(function (jqXHR, textStatus, errorThrown) {
      // Previously failures were dropped without any trace.
      console.log("AJAX error: " + textStatus + " " + errorThrown);
    });
}
// Bare expression: the script's last statement evaluates to `result` (the
// JSON string read from localStorage.imageTag or produced by finish()).
// NOTE(review): presumably this is the completion value handed back to
// whatever evaluates/injects this script — confirm against the caller.
result