- var data = Array.isArray(target.data) ? target.data : [ target.data ];
- for (var j = 0; j < data.length; j++) {
- values.push($(this).data(data[j]));
- }
- } else if (target.attr) {
- // Ex: <img src="foo">, <a href="foo">foo</a>
- values.push($(this).attr(target.attr));
- } else if (target.text) {
- // Ex: <p>value<span>skip this</span></p>
- values.push($(this).contents().filter(function() { return this.nodeType == 3; } ).text()); // 3 = TEXT_NODE
- } else {
- // Ex: <h1>value</h1>, <p>value</p>
- var value = target.tags ? $(this).html() : $(this).text();
- values.push(value.trim());
- }
- });
-
- // Run functions defined in config on found values
- if (target.func) {
- var functions = Array.isArray(target.func) ? target.func : [ target.func ];
- for (var j = 0; j < functions.length; j++) {
- var name = functions[j].name;
- var args = functions[j].args;
- for (var k = 0; k < values.length; k++) {
- var value = values[k];
- values[k] = _functions[name](value, args);
- log.silly(logPrefix + 'Run function: ' + name + (args ? ' (' + value + ', ' + JSON.stringify(args) + ')' : ''));
- }
- }
- }
-
- // Store found and processed values in data structure
- if (values.length > 1) {
- // Support joining of values
- if (target.glue) {
- log.silly(logPrefix + 'Glue: ' + values.length + ' items joined with "' + target.glue + '"');
- values = values.join(target.glue);
- }
- data[key] = data[key] ? Array.concat(data[key], values) : values; // join values with same name
- } else if (values.length) {
- data[key] = data[key] ? values.concat(data[key]) : values.pop(); // join values with same name
- }
- }
- }
-}
-
-// Parse functions
-var _functions = {
- regexp: function(value, args) {
- var matches = value.match(new RegExp(args[0]));
- if (matches) {
- return args[1] ? matches[args[1]] : matches;
- } else {
- return null;
- }
- },
- prepend: function(value, text) {
- return text + value;
- },
- append: function(value, text) {
- return value + text;
- },
- join: function(value, args) {
- var str = '';
- for (var i = 0; i < args.length; i++) {
- if (args[i].charAt(0) === '$') {
- if (Array.isArray(value)) {
- var index = parseInt(args[i].substr(1));
- str += value[index];
- }
- else {
- str += value;
- }
- }
- else {
- str += args[i];
- }
- }
- return str;
- },
- replace: function(value, args) {
- // Check if pattern is a regex or string
- var pattern = args[2] && args[2] == 'regexp' ? new RegExp(args[0], 'g') : args[0];
- return value.replace(pattern, args[1]);
- },
- toInt: function(value) {
- return /^\d+$/.test(value) ? parseInt(value, 10) : false;
- },
- parsePrice: function(price, args) {
- // Example inputs: "kr 2.347,95", "969 NOK", "625 kr.", "449.0" (number)
- price = price.toString().replace(/[^\d,.]/g, ''); // strip everything except numbers, "," and "."
- var match = price.match(/^([\d,.]+)[.,](\d{1,2})$/); // split price on decimals if they exist
- if (match) {
- price = match[1].replace(/[,.]/g, ''); // strip "," and "." from the part before the decimals
- price = Math.round(price + '.' + match[2]); // add decimals and round
- } else if (price) {
- price = parseInt(price.replace(/[,.]/g, ''), 10); // strip "," and "." and convert to int
- } else {
- price = null;
- }
- if (args && args.addVat) {
- price = Math.round(price * 1.25); // Add MVA/VAT and round
- }
- return price;
- },
- htmlEntitiesDecode: function(value) {
- return he.decode(value);
- }
-}
-
-
-
-module.exports = Parser;
diff --git a/queue.js b/queue.js
deleted file mode 100644
index 1dc0cbc..0000000
--- a/queue.js
+++ /dev/null
@@ -1,79 +0,0 @@
-var log = require('./log');
-
-var Queue = function(options) {
- this._stack = {
- get: [], // items at current level
- add: [] // items for next level
- }
- this.active = true;
-
- // Handle options and defaults
- options = options || {};
- this._maxLevel = options.maxLevel || false;
- this._maxItems = options.maxItems || false
-
- // Internal
- this._currentLevel = 0;
- this._currentItem = 0;
-}
-
-Queue.prototype.add = function(items) {
- log.verbose('[Queue] Received ' + items.length + ' items');
- if (!this._maxLevel || this._currentLevel < this._maxLevel) {
-
- // Support items not in array
- if (!Array.isArray(items)) {
- items = [ items ];
- }
-
- // Add items to the stack
- var n = items.length;
- var added = 0;
- for (var i = 0; i < n; i++) {
- if (!this._maxItems || this._currentItem < this._maxItems) {
- this._stack.add.push(items[i]);
- this._currentItem++;
- added++;
- log.debug('[Queue] Added item: ' + items[i]);
- } else {
- this.active = false;
- log.debug('[Queue] Reached max items limit of ' + this._maxItems);
- break;
- }
- }
- log.verbose('[Queue] Added ' + added + ' items');
- } else {
- log.verbose('[Queue] No items added');
- }
-}
-
-Queue.prototype.get = function() {
- log.verbose('[Queue] Get item');
- if (this._stack.get.length) {
- return this._stack.get.pop();
- } else {
- if (!this._maxLevel || this._currentLevel < this._maxLevel) {
- // Set next level
- this._currentLevel++;
- log.verbose('[Queue] Starting level ' + this._currentLevel);
-
- // Switch stack
- this._stack.get = this._stack.add;
- this._stack.add = [];
-
- if (this._stack.get.length) {
- return this._stack.get.pop();
- } else {
- this.active = false;
- log.verbose('[Queue] No more items in queue');
- return false;
- }
- } else {
- this.active = false;
- log.verbose('[Queue] Reached max level limit of ' + this._maxLevel);
- return false;
- }
- }
-}
-
-module.exports = Queue;
diff --git a/solr.js b/solr.js
deleted file mode 100644
index 9732815..0000000
--- a/solr.js
+++ /dev/null
@@ -1,33 +0,0 @@
-var solr = require('solr-client')
-var log = require('./log');
-
-var Solr = function(host, port, index) {
- if (index === undefined) {
- log.warn('[Solr] No index provided');
- return;
- }
- host = host || '192.168.1.104';
- port = port || '8983';
- this.client = solr.createClient({
- host: host,
- port: port,
- core: index
- });
-}
-
-Solr.prototype = Object.create(require('events').EventEmitter.prototype);
-
-Solr.prototype.add = function(doc) {
- var self = this;
- this.client.add(doc, function(error, response) {
- if (error) {
- log.error('[Solr] ' + error);
- } else {
- self.client.commit();
- log.verbose('[Solr] Document added');
- self.emit('added', response);
- }
- });
-}
-
-module.exports = Solr;
\ No newline at end of file
diff --git a/src/config.js b/src/config.js
new file mode 100644
index 0000000..7faf775
--- /dev/null
+++ b/src/config.js
@@ -0,0 +1,64 @@
+import fs from 'fs';
+import URI from 'urijs';
+import log from './log';
+
+export default { setPath, get };
+
+let _path = __dirname + '/../config';
+
+function setPath(path) {
+ _path = path;
+}
+
+function get(arg) {
+ if (Number.isInteger(arg)) {
+ return getById(arg);
+ } else if (arg.indexOf('http') !== -1) {
+ return getByUrl(arg);
+ } else if (/^[^/]+\.json$/.test(arg)) {
+ return parse(_path + '/' + arg);
+ } else {
+ return parse(arg);
+ }
+}
+
+function getById(id) {
+ let files = getFiles(_path);
+ for (let i = 0; i < files.length; i++) {
+ if (files[i].match(/-(\d+)/).pop() == id) {
+ return parse(files[i]);
+ }
+ }
+ return false;
+}
+
+function getByUrl(url) {
+ let hostname = new URI(url).hostname();
+ let files = getFiles(_path);
+ for (let i = 0; i < files.length; i++) {
+ let config = parse(files[i]);
+ if (hostname == new URI(config.url).hostname()) {
+ // log.verbose('[Config] Get config for ' + obj.name);
+ return config;
+ }
+ }
+ // log.error('[Config] Missing config for ' + url);
+ return false;
+}
+
+// Recursively find all files
+function getFiles(path) {
+ var files = [];
+ fs.readdirSync(path).forEach( (file) => {
+ if (fs.statSync(path + '/' + file).isDirectory()) {
+ files = files.concat(getFiles(path + '/' + file));
+ } else {
+ files.push(path + '/' + file);
+ }
+ });
+ return files;
+}
+
+function parse(file) {
+ return JSON.parse(fs.readFileSync(file).toString());
+}
diff --git a/src/download.js b/src/download.js
new file mode 100644
index 0000000..fcc3e01
--- /dev/null
+++ b/src/download.js
@@ -0,0 +1,31 @@
+import request from "request";
+import log from './log';
+
+export default function(url, timeout) {
+ timeout = timeout ? timeout * 1000 : 60000;
+ const t0 = process.hrtime();
+ const options = {
+ url: url,
+ headers: {
+ 'User-Agent': USER_AGENT
+ },
+ timeout: timeout
+ }
+ return new Promise(function (fulfill, reject) {
+ request(options, function (error, response, html) {
+ if (error !== null) {
+ reject(error);
+ } else if (response.statusCode !== 200) {
+ reject('Error! Response code: ' + response.statusCode);
+ } else if (html){
+ var diff = process.hrtime(t0);
+ fulfill({
+ html: html,
+ time: (diff[0] + diff[1] * 1e-9).toFixed(2)
+ });
+ } else {
+ reject('This should not happen');
+ }
+ });
+ });
+}
diff --git a/src/index.js b/src/index.js
new file mode 100644
index 0000000..821f3a6
--- /dev/null
+++ b/src/index.js
@@ -0,0 +1,9 @@
+import config from './config';
+import download from "./download"
+import Parser from "./parser"
+import log from "./log"
+
+global.VERSION = '1.0.0-alpha';
+global.USER_AGENT = 'Edderkopp/' + VERSION;
+
+export { config, download, Parser, log };
diff --git a/log.js b/src/log.js
similarity index 69%
rename from log.js
rename to src/log.js
index f375ea6..93196fc 100644
--- a/log.js
+++ b/src/log.js
@@ -1,4 +1,14 @@
-var winston = require('winston');
+import winston from 'winston';
+/**
+ * Config levels:
+ * silly: 0,
+ * debug: 1,
+ * verbose: 2,
+ * info: 3,
+ * warn: 4,
+ * error: 5
+ */
+
winston.emitErrs = true;
var log = new winston.Logger({
@@ -22,13 +32,8 @@ var log = new winston.Logger({
exitOnError: false
});
-module.exports = log;
+log.setLevel = function(level) {
+ this.transports.console.level = level;
+};
-//npmConfig.levels = {
-// silly: 0,
-// debug: 1,
-// verbose: 2,
-// info: 3,
-// warn: 4,
-// error: 5
-//};
+export default log;
diff --git a/src/parser.js b/src/parser.js
new file mode 100644
index 0000000..e69d825
--- /dev/null
+++ b/src/parser.js
@@ -0,0 +1,135 @@
+import URI from 'urijs';
+import cheerio from "cheerio";
+import log from './log';
+import * as tasks from './parser.tasks';
+
+export default class Parser {
+
+ constructor(html) {
+ this.$ = cheerio.load(html);
+ }
+
+ getData(rules) {
+ return this._recParse(rules);
+ }
+
+ // Recursively parse DOM
+ _recParse(rules, data, $container) {
+ let $ = this.$;
+ data = data || {};
+ for (let i = 0; i < rules.length; i++) {
+ const rule = rules[i];
+ if (rule.name) {
+ const $elem = rule.elem ? $(rule.elem, $container) : $container;
+ if (rule.data == 'array') {
+ data[rule.name] = [];
+ $elem.each((i, e) => {
+ let obj = {};
+ data[rule.name].push(obj);
+ this._recParse(rule.kids, obj, $(e));
+ });
+ } else if (rule.data == 'object') {
+ data[rule.name] = {};
+ this._recParse(rule.kids, data[rule.name], $elem);
+ } else {
+ if ($elem.length > 0) {
+ const values = this._getContent($elem, rule);
+ // Join values with same name
+ data[rule.name] = data[rule.name] ? [].concat(data[rule.name], values) : values;
+ } else if (!rule.null){
+ log.warn('Element not found: ' + rule.elem);
+ }
+ }
+ } else if (rule.elem) {
+ this._recParse(rule.kids, data, $(rule.elem, $container));
+ }
+ }
+ return data;
+ }
+
+ // Get values
+ _getContent($elem, rule) {
+ let $ = this.$;
+ let values = [];
+ const dataType = Array.isArray(rule.data) ? rule.data[0] : rule.data;
+ $elem.each(function() {
+ switch (dataType) {
+ case 'html':
+ // Get all content including tags
+ // Ex: <div><p>paragraph 1</p><p>paragraph 2</p><p>paragraph 3</p></div>
+ values.push($(this).html().trim());
+ break;
+ case 'txtn':
+ // Get only text nodes
+ // Ex: <p><span>skip this</span> get this <span>skip this</span></p>
+ values.push($(this).contents().filter(function() {
+ return this.nodeType == 3; // 3 = TEXT_NODE
+ }).text().trim());
+ break;
+ case 'attr':
+ // Get content from attribute
+ // Ex: <img src="foo">, <a href="foo">foo</a>
+ for (let i = 1; i < rule.data.length; i++) {
+ values.push($(this).attr(rule.data[i]));
+ }
+ break;
+ case 'data':
+ // Get content from data
+ // Ex: <div data-foo="value" data-bar="value"></div>
+ for (let i = 1; i < rule.data.length; i++) {
+ values.push($(this).data(rule.data[i]));
+ }
+ break;
+ case 'text':
+ default:
+ // Get only text (strip away tags)
+ values.push($(this).text().trim());
+ }
+ });
+
+ // Run tasks on values
+ if (rule.task) {
+ let task;
+ if (typeof rule.task == 'string') {
+ // "task": "foobar"
+ task = [ [ rule.task ] ];
+ } else if (!Array.isArray(rule.task[0])) {
+ // "task": [ "foobar", "arg1", "arg2" ]
+ task = [ rule.task ];
+ } else {
+ // "task": [
+ // [ "foobar1", "arg1", "arg2" ],
+ // [ "foobar2", "arg1", "arg2" ]
+ // ]
+ task = rule.task;
+ }
+ for (let i = 0; i < task.length; i++) {
+ for (let j = 0; j < values.length; j++) {
+ let name = task[i][0];
+ let args = task[i].slice(1);
+ if (tasks[name]) {
+ values[j] = tasks[name](args, values[j]);
+ } else {
+ log.warn('task not exist: ' + name);
+ }
+ }
+ }
+ }
+
+ if (values.length == 1) {
+ values = values.pop();
+ }
+
+ return values;
+ }
+
+ // Support custom tasks
+ static injectTasks(customTasks) {
+ for (var prop in customTasks) {
+ if (tasks[prop]) {
+ log.warn('Overriding task: ' + prop);
+ }
+ tasks[prop] = customTasks[prop];
+ }
+ }
+}
diff --git a/src/parser.tasks.js b/src/parser.tasks.js
new file mode 100644
index 0000000..289789b
--- /dev/null
+++ b/src/parser.tasks.js
@@ -0,0 +1,63 @@
+export {
+ _match as match,
+ _prepend as prepend,
+ _append as append,
+ _join as join,
+ _replace as replace,
+ _parseInt as parseInt
+};
+
+// "task": [ "match", "\\/(\\d+)\\.", 1 ]
+function _match(args, value) {
+ var matches = value.match(new RegExp(args[0]));
+ if (matches) {
+ return args[1] ? matches[args[1]] : matches;
+ } else {
+ return null;
+ }
+}
+
+// "task": [ "prepend", "http://foo.bar/" ]
+function _prepend(args, value) {
+ return args[0] + value;
+}
+
+// "task": [ "append", "&foo=bar" ]
+function _append(args, value) {
+ return value + args[0];
+}
+
+// "task": [ "join", "http://foo.bar/", "$1" ]
+// "task": [ "join", "$1", "$3", "(foobar)", "$2" ]
+function _join(args, value) {
+ var str = '';
+ for (var i = 0; i < args.length; i++) {
+ if (args[i].charAt(0) === '$') {
+ str += value[args[i].substr(1)];
+ } else {
+ str += args[i];
+ }
+ }
+ return str;
+}
+
+// "task": [ "replace", "foo", "bar" ]
+// "task": [ "replace", "[\\r\\n\\t\\s]+", "", "regexp" ]
+function _replace(args, value) {
+ if (typeof args[0] == 'string' && typeof args[1] == 'string') {
+ args[0] = [ args[0] ];
+ args[1] = [ args[1] ];
+ }
+ var pattern;
+ for (var i = 0; i < args[0].length; i++) {
+ pattern = args[2] && args[2] == 'regexp' ? new RegExp(args[0][i], 'g') : args[0][i];
+ value = value.replace(pattern, args[1][i]);
+ }
+ return value;
+}
+
+// "task": [ "parseInt" ]
+function _parseInt(args, value) {
+ value = value ? value.replace(/[^\d]/g, '') : null;
+ return value ? parseInt(value, 10) : null;
+}