diff --git a/.babelrc b/.babelrc new file mode 100644 index 0000000..252f39f --- /dev/null +++ b/.babelrc @@ -0,0 +1 @@ +{ "presets": ["es2015","stage-0"] } diff --git a/LICENSE b/LICENSE index d6a9326..a8e851c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,340 +1,21 @@ -GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. 
You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. 
The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. 
(Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. 
You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. 
- -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. 
If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. 
If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - {description} - Copyright (C) {year} {fullname} - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - {signature of Ty Coon}, 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. 
- +MIT License + +Copyright (c) 2016 Alf Marius Foss Olsen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 7a9b463..2b26c2c 100644 --- a/README.md +++ b/README.md @@ -1 +1,240 @@ -![](edderkopp.png) +**!! WORK IN PROGRESS !!** + +#Config + +Config files are written in **json** and organized so that each *level* has a bunch of *nodes*. +``` +{ + "id": 1, + "url": "http://foo.bar/", + "pages": { + "pageOne": [ .. ], + "pageTwo": [ .. ] + }, + "crawl": { + "delay": 300, + "get": [ .. ], + "skip": [ .. ] + } +} +``` + +Each page contains an array of rule-objects. + +``` +{ + "name": "myObject", + "elem": "#container", + "data": "object", + "kids": [ + { + "name": "title", + "elem": ".title" + }, + { + "name": "description", + "elem": ".desc", + "data": "html" + }, + { + "name": "image", + "elem": "img", + "data": [ "attr", "src" ] + } + ] +} +``` +``` +{ + "name": "..", + "elem": "..", + "data": "..", + "task": "..", + "kids": [ .. 
] +} + +``` +##Types of nodes + +###1) Object +When you want to get a bunch of unique properties and collect them in an object +``` +{ + "name": "myObject", + "data": "object", + "kids": [ .. ] +} +``` +Result : `{ myObject: { .. } }` + +###2) Array of objects +When you want to get bulks of similar data and collect them in an array (lists, tables, feeds, etc) +``` +{ + "name": "myArray", + "data": "array", + "kids": [ .. ] +} +``` +Result : `{ myArray: [ {..}, {..}, .. ] }` + +###3) Container +When you just want to make a container for searching for more elements (collects nothing) +``` +{ + "elem": ".foo", + "kids": [ .. ] +} +``` +###4) Content +When you want to get the actual content +``` +{ + "name": "myValue", + "elem": ".value", + "data": ".." +} +``` +Details are explained further below. + +##How to get actual content + +###Element content, strip tags +``` +
+ foo bar +
+``` +``` +{ + "name": "myText", + "elem": ".text", + "data": "text" +} +``` +Result : `{ myText: 'foo bar' }` +Note: This is default + +###Element content, include tags + +``` +
+ foo bar +
+``` +``` +{ + "name": "myHtml", + "elem": ".html", + "data": "html" +} +``` +Result : `{ myHtml: 'foo bar' }` + +###Element content, only text nodes + +``` +
+ foo bar +
+``` +``` +{ + "name": "myTextNode", + "elem": ".text-node", + "data": "txtn" +} +``` +Result : `{ myTextNode: 'bar' }` + +###Element attribute(s) + +``` + +``` +``` +{ + "name": "myAttr", + "elem": ".attr", + "data": [ "attr", "src" ] +} +``` +Result : `{ myAttr: 'http://foo.bar/img.jpg' }` + +Note: *attr* also supports more arguments, returning an array (see *data* below) + +###Element data field(s) + +``` +
+``` +``` +{ + "name": "myData", + "elem": ".data", + "data": [ "data", "small", "big" ] +} +``` +Result : `{ myData: [ 'small.jpg', 'big.jpg' ] }` + +Note: *data* also supports one argument, returning a string (see *attr* above) + +###Special case +Sometimes a node has content you want as well as being a container! Take a look at this scenario: +``` +
+ Click! + .. +
+``` +``` +{ + "name": "myItem", + "elem": ".item", + "data": "object", + "kids": [ + { + "name": "title", + "data": [ "attr", "meta-title" ] + }, + { + "name": "url", + "elem": "a", + "data": [ "attr", "href" ] + }, + .. + ] +} +``` +Result : `{ myItem: { title: 'foobar', url: 'http://foo.bar' } }` +Note: The first child has no *elem* defined, so it uses the parent element. + +#Parser + +###Custom tasks +You can inject your own custom tasks or override those included in the core. +``` +// my-tasks.js +export { _myTask as myTask } +.. +function _myTask(args, value) { + return 'foobar'; +} +.. +``` +``` +// app.js +import { Parser } from "edderkopp"; +import * as tasks from './my-tasks'; + +// Inject all tasks defined in my-tasks.js +Parser.injectTasks(tasks); + +// Or you can inject an anonymous task +Parser.injectTasks({ + anonTask: function(args, value) { + return 'foobar'; + } +}); +``` +Note: If you feel your custom task should be included in core, feel free to make a pull request or create an issue. 
diff --git a/config.js b/config.js deleted file mode 100644 index ba90420..0000000 --- a/config.js +++ /dev/null @@ -1,88 +0,0 @@ -var fs = require('fs'); -var URI = require('URIjs'); -var log = require('./log'); - -var _config = './config'; -var _shops; - -var Config = function() {} - -// Inject config path -Config.prototype.setPath = function(path) { - if (fs.existsSync(path)) { - _config = path; - } else { - throw new Error('Path "' + path + '" don\'t exist'); - } -} - -// Load all shops -Config.prototype.getShops = function() { - if (!_shops) { - log.verbose('[Config] Get config for all shops'); - var files = getFiles(_config + '/sites/shops'); - var _shops = {}; - for (var i = 0; i < files.length; i++) { - var obj = JSON.parse(fs.readFileSync(files[i]).toString()); - _shops[obj.key] = obj; - _shops[obj.id] = obj; - - } - } - return _shops; -} - -Config.prototype.getByUrl = function(url) { - var hostname = new URI(url).hostname(); - var files = getFiles(_config + '/sites'); - for (var i = 0; i < files.length; i++) { - var obj = JSON.parse(fs.readFileSync(files[i]).toString()); - if (hostname == new URI(obj.url).hostname()) { - log.verbose('[Config] Get config for ' + obj.name); - return obj; - } - } - log.error('[Config] Missing config for ' + url); - return false; -} - -Config.prototype.parseFile = function(file) { - return JSON.parse(fs.readFileSync(file).toString()); -} - -//Config.prototype.load = function(target, key) { -// log.verbose('[Config] Load ' + target + ': ' + key); -// init(); -// -// var file = _config[target] + '/' + key + '.json'; -// if (fs.existsSync(file)) { -// return this[target][key] = JSON.parse(fs.readFileSync(file).toString()); -// } else { -// log.error('[Config] Missing ' + file); -// return false; -// } -//} - - -function getContent(file) { - var obj = JSON.parse(fs.readFileSync(path + '/' + file).toString()); - self[target][obj.key] = obj; - self[target][obj.id] = obj; -} - -function getFiles(path) { - var files = []; - 
fs.readdirSync(path).forEach(function(file) { - if (fs.statSync(path + '/' + file).isDirectory()) { - files = files.concat(getFiles(path + '/' + file)); - } else { - files.push(path + '/' + file); - } - //log.verbose(file); - //var obj = JSON.parse(fs.readFileSync(path + '/' + file).toString()); - //self[target][obj.key] = obj; - //self[target][obj.id] = obj; - }); - return files; -} -module.exports = Config; diff --git a/customHttpConnector.js b/customHttpConnector.js deleted file mode 100644 index 228ebfb..0000000 --- a/customHttpConnector.js +++ /dev/null @@ -1,41 +0,0 @@ -var HttpConnector = require('elasticsearch/src/lib/connectors/http'); -var qs = require('querystring'); -var inherits = require('util').inherits; -var fs = require('fs'); - -function CustomHttpConnector(host, config) { - HttpConnector.call(this, host, config); -} - -inherits(CustomHttpConnector, HttpConnector); - -CustomHttpConnector.prototype.makeReqParams = function (params) { - params = params || {}; - var host = this.host; - - var reqParams = { - method: params.method || 'GET', - protocol: host.protocol + ':', - auth: host.auth, - hostname: host.host, - port: host.port, - path: (host.path || '') + (params.path || ''), - headers: host.getHeaders(params.headers), - agent: this.agent, - rejectUnauthorized: true, - ca: fs.readFileSync('carlsen.crt', 'utf8') - }; - - if (!reqParams.path) { - reqParams.path = '/'; - } - - var query = host.getQuery(params.query); - if (query) { - reqParams.path = reqParams.path + '?' 
+ qs.stringify(query); - } - - return reqParams; -}; - -module.exports = CustomHttpConnector; diff --git a/download.js b/download.js deleted file mode 100644 index 3e7fef1..0000000 --- a/download.js +++ /dev/null @@ -1,55 +0,0 @@ -var request = require('request'); -var log = require('./log'); -var Promise = require('es6-promise').Promise; - -var Download = function() {} - -Download.prototype = Object.create(require('events').EventEmitter.prototype); - -Download.prototype.get = function(url, obj) { - //var self = this; - var t = process.hrtime(); - var options = { - url: url, - headers: { - 'User-Agent': 'request' - }, - timeout: 60000 - } - return new Promise(function (fulfill, reject) { - request(options, function (error, response, html) { - if (error !== null) { - if (error.code === 'ETIMEDOUT') { - log.warn('[Download] Timeout of ' + options.timeout * 0.001 + 's reached for ' + url); - obj ? fulfill(obj) : fulfill(); - } else { - log.error('[Download] ' + error.toString() + ' (' + url + ')'); - reject(error.toString()); - } - } else if (response.statusCode !== 200) { - log.error('[Download] Status code ' + response.statusCode + ' (' + url + ')'); - if (response.statusCode == 404 || response.statusCode == 500) { - obj ? fulfill(obj) : fulfill(); - } else { - reject(response.statusCode); - } - } else if (html){ - t = process.hrtime(t); - var diff = (t[0] + t[1] * 1e-9).toFixed(2); - log.verbose('[Download] ' + url + ' (' + diff + 's)'); - if (obj) { - obj.html = html; - fulfill(obj); - } else { - fulfill(html); - } - } else { - log.error('[Download] WEIRD!! 
No errors AND no html'); - reject(); - } - }); - }); - -} - -module.exports = Download; diff --git a/edderkopp.js b/edderkopp.js deleted file mode 100644 index cc398c5..0000000 --- a/edderkopp.js +++ /dev/null @@ -1,113 +0,0 @@ -var log = require('./log'); -var Config = require('./config'); -var Queue = require('./queue'); -var Download = require('./download'); -var Parser = require('./parser'); -var Elasticsearch = require('./elasticsearch'); -var Solr = require('./solr'); - -var Edderkopp = function(options) { - this.config = new Config(); - this.queue = new Queue(); - this.download = new Download(); - this.parser = new Parser(); - this.site = null; // Current site - - if (options) { - if (options.configPath) { - this.config.setPath(options.configPath); - } - if (options.logLevel) { - log.transports.console.level = options.logLevel; - } - } -}; - -//Edderkopp.prototype.initIndex = function(target) { -// if (!this.config.loadAuth(target)) { -// return; -// } -// -// if (target == 'elasticsearch') { -// this.elasticsearch = new Elasticsearch(this.config.elasticsearch); -// } else if (target == 'solr') { -// -// } -//} -// -//Edderkopp.prototype.checkIndex = function() { -// this.initIndex('elasticsearch'); -// if (this.elasticsearch) { -// this.elasticsearch = new Elasticsearch(this.config.elasticsearch); -// this.elasticsearch.info(); -// } -//} - -// DEPRECIATED -Edderkopp.prototype.initSite = function(site) { - log.verbose('[Edderkopp] Init site: ' + site); - - if (!this.config.loadSite(site)) { return false; } - - this.parser.init({ - url: this.config.sites[site].url, - targets: this.config.sites[site].targets, - blacklist: this.config.sites[site].blacklist || {} - }); - return true; -} -// DEPRECIATED -Edderkopp.prototype.getSite = function(site) { - log.verbose('[Edderkopp] Get site: ' + site); - - if (!this.initSite(site)) { - return; - } - this.initIndex('elasticsearch'); - - var self = this; - this.download.get(this.config.site.url); - 
this.download.on('finished', function(response) { - self.parser.load(response); - if (self.parser.isTarget()) { - var data = self.parser.getData(); - data.url = response.url; - if (self.elasticsearch) { - var id = 'shop_id' + self.config.site.id + 'product_id' + data.shop_product_id; - self.elasticsearch.exists('shopz', 'product', id, function(response) { - if (response) { - log.warn('[Edderkopp] Document exists: ' + id); - } else { - log.verbose('[Edderkopp] Adding document: ' + id); - self.elasticsearch.create(data, 'shopz', 'product', id); - } - - }); - } else { - console.log(data); - } - } - - if (self.queue.active) { - var links = self.parser.getLinks(); - if (links) { - self.queue.add(links); - } - } - - var url = self.queue.get(); - if (url) { - // Delay - if (self.config.site.delay) { - setTimeout(function() { - self.download.get(url); - }, self.config.site.delay) - } else { - self.download.get(url); - } - - } - }); -} - -module.exports = Edderkopp; diff --git a/edderkopp.png b/edderkopp.png deleted file mode 100644 index 63ac236..0000000 Binary files a/edderkopp.png and /dev/null differ diff --git a/elasticsearch.js b/elasticsearch.js deleted file mode 100644 index a2ebe74..0000000 --- a/elasticsearch.js +++ /dev/null @@ -1,85 +0,0 @@ -var elasticsearch = require('elasticsearch'); -var CustomHttpConnector = require('./customHttpConnector'); -var log = require('./log'); - -var Elasticsearch = function(host) { - this.client = new elasticsearch.Client({ - host: host, - //log: { - //level: 'info' - //}, - keepAlive: true, - apiVerison: "1.3", - connectionClass: CustomHttpConnector - }); -} - -Elasticsearch.prototype = Object.create(require('events').EventEmitter.prototype); - - -Elasticsearch.prototype.info = function() { - var options = { - requestTimeout: 1000 - }; - this.client.info(options, function (error, response, status) { - console.log(error); - console.log(status); - console.log(response); - }); - -} -Elasticsearch.prototype.exists = function(index, 
type, id, cb) { - var options = { - index: index, - type: type, - id: id - }; - this.client.exists(options, function (error, response) { - cb(response); - }); -} - -Elasticsearch.prototype.create = function(doc, index, type, id) { - var self = this; - var options = { - index: index, - type: type, - body: doc - }; - if (id !== undefined) { - options.id = id; - } - this.client.create(options, function (error, response) { - if (error) { - log.error('[Elasticsearch] ' + error); - } else { - log.verbose('[Elasticsearch] Document id:' + id + ' added'); - //self.emit('added', response); - } - }); -} - -Elasticsearch.prototype.update = function(doc, index, type, id) { - var self = this; - var options = { - index: index, - type: type, - body: { - doc: doc, - doc_as_upsert: true - } - }; - if (id !== undefined) { - options.id = id; - } - this.client.update(options, function (error, response) { - if (error) { - log.error('[Elasticsearch] ' + error); - } else { - log.verbose('[Elasticsearch] Document id:' + id + ' added'); - //self.emit('added', response); - } - }); -} - -module.exports = Elasticsearch; diff --git a/example/site.html b/example/site.html new file mode 100644 index 0000000..815b418 --- /dev/null +++ b/example/site.html @@ -0,0 +1,69 @@ + + + + + Foo Bar + + + + +
+
+ +
+
+ + +
+
+

Once upon a time there was an item

+

that lived happily ever after

+
+
+ Get this but not the wrapping span tags or strong tags +
+
+ +
+ +
+
+ Don't get this element + This is what I want! + Don't get this element +
+
+
+

I'm not special

+
+ +
+
+

Square comment

+
+
Blue
+
Metal
+
+
+
Green
+
Wood
+
+
+ +
+

Round comment

+
+
Yellow
+
Plastic
+
+
+
Purple
+
Rubber
+
+
+ +
+
+ + + diff --git a/example/site.json b/example/site.json new file mode 100644 index 0000000..b8c503f --- /dev/null +++ b/example/site.json @@ -0,0 +1,126 @@ +{ + "id": 1234, + "name": "site", + "pages": { + "somePage": [ + { + "name": "item", + "data": "object", + "kids": [ + { + "name": "title", + "elem": "head meta[property=og\\:title]", + "data": [ "attr", "content"] + }, + { + "elem": ".item", + "kids": [ + { + "name": "id", + "data": [ "data", "id"], + "task": [ + [ "match", "\\D*'([\\d]+)\\D*", 1 ], + [ "parseInt" ] + ] + }, + { + "name": "image", + "elem": ".image-main img, .image-extra img", + "data": [ "attr", "src" ], + "task": [ + [ "replace", "thumb", "big" ], + [ "prepend", "https://foo.bar/" ] + ] + }, + { + "name": "description", + "elem": ".description", + "data": "html" + }, + { + "name": "short", + "elem": ".short" + }, + { + "elem": ".wrapper", + "kids": [ + { + "name": "subtitle", + "elem": "meta[itemprop=title]", + "data": [ "attr", "content" ] + }, + { + "name": "image", + "elem": ".image-wrapped img", + "data": [ "attr", "src" ], + "task": [ + [ "replace", "thumb", "big" ], + [ "prepend", "https://bar.foo/" ], + [ "append", "?foo=bar" ] + ] + }, + { + "name": "textOnly", + "elem": ".text-only", + "data": "txtn" + } + ] + }, + { + "name": "special", + "elem": ".special", + "data": "object", + "kids": [ + { + "name": "first", + "data": [ "data", "title" ] + }, + { + "name": "second", + "elem": "p" + } + ] + }, + { + "name": "objects", + "data": "object", + "kids": [ + { + "name": "objectWrapper", + "elem": ".objects .group", + "data": "array", + "kids": [ + { + "name": "group", + "data": [ "attr", "content" ] + }, + { + "name": "comment", + "elem": ".comment" + }, + { + "name": "details", + "elem": ".details", + "data": "array", + "kids": [ + { + "name": "color", + "elem": ".color" + }, + { + "name": "material", + "elem": ".material" + } + ] + } + ] + } + ] + } + ] + } + ] + } + ] + } +} diff --git a/getPage.js b/getPage.js deleted file mode 
100755 index f40b153..0000000 --- a/getPage.js +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env node -var log = require('./log'); -var Edderkopp = require('./edderkopp'); - -//log.transports.console.level = 'verbose'; -log.transports.console.level = 'debug'; -log.transports.console.prettyPrint = true; - -// Check argv -var url = process.argv[2]; -if (url === undefined) { - var thisFile = process.argv[1].split('/').pop(); - log.info('Usage: ' + thisFile + ' '); - process.exit(1); -} - -// Init Edderkopp -var edderkopp = new Edderkopp(); - -edderkopp.config.setPath('/nfs/home/alf/git/prisguide/node/config'); -var config = edderkopp.config.getByUrl(url); -if (config) { - edderkopp.download.get(url).then(function(html) { - - // Load parser with html etc - log.verbose('[getPage] Parse'); - var obj = { - url: url, - config: config, - html: html - } - - // Parse html and get data specified in config - var data = edderkopp.parser.getData(obj); - //log.info(data.web); - }).catch(function (error) { - log.error(error); - }); -} diff --git a/getSite.js b/getSite.js deleted file mode 100755 index d82aec2..0000000 --- a/getSite.js +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env node -var log = require('./log'); -var Edderkopp = require('./edderkopp'); - -log.transports.console.level = 'verbose'; - -// Check argv -var site = process.argv[2]; -if (site === undefined) { - var thisFile = process.argv[1].split('/').pop(); - log.info('Usage: ' + thisFile + ' '); - process.exit(1); -} - -var edderkopp = new Edderkopp(); -edderkopp.getSite(site); diff --git a/package.json b/package.json index 1c05ea1..4c6aa0c 100644 --- a/package.json +++ b/package.json @@ -1,29 +1,39 @@ { "name": "edderkopp", - "version": "0.1.0", - "description": "Crawl webpages and get data according to config file", - "main": "edderkopp.js", + "version": "1.0.0-alpha", + "description": "Crawl and scrape webpages for data using simple json config files", + "main": "dist/index.js", "scripts": { - "test": "echo \"Error: 
no test specified\" && exit 1" + "build": "babel src -d dist", + "watch": "babel --watch src -d dist" }, "repository": { "type": "git", "url": "git+https://github.com/fractalf/edderkopp.git" }, - "author": "Alf Marius Foss Olsen", - "license": "GNU GENERAL PUBLIC LICENSE", + "keywords": [ + "crawler", + "es6", + "es2015" + ], + "author": "Alf Marius Foss Olsen ", + "license": "MIT", "bugs": { "url": "https://github.com/fractalf/edderkopp/issues" }, - "homepage": "https://github.com/fractalf/edderkopp#readme", + "homepage": "https://github.com/fractalf/edderkopp", "dependencies": { - "URIjs": "^1.15.2", - "cheerio": "^0.19.0", - "elasticsearch": "^5.0.0", - "es6-promise": "^2.3.0", - "he": "^0.5.0", - "request": "^2.58.0", - "solr-client": "^0.5.0", - "winston": "^1.0.1" + "bluebird": "^3.4.0", + "blueimp-md5": "^2.3.0", + "cheerio": "^0.20.0", + "request": "^2.72.0", + "robots-parser": "^1.0.0", + "urijs": "^1.18.1", + "winston": "^2.2.0" + }, + "devDependencies": { + "babel-cli": "^6.9.0", + "babel-preset-es2015": "^6.9.0", + "babel-preset-stage-0": "^6.5.0" } } diff --git a/parser.js b/parser.js deleted file mode 100644 index 2fbae01..0000000 --- a/parser.js +++ /dev/null @@ -1,309 +0,0 @@ -var cheerio = require('cheerio'); -var $; -var URI = require('URIjs'); -var he = require('he'); -var crypto = require('crypto'); -var log = require('./log'); - -//var _config; -var Parser = function() {} - -Parser.prototype.init = function(options) { - this._targets = options.targets; - this._blacklist = options.blacklist; - this._siteUri = new URI(options.url); - this._cache = {}; - this._cacheFile = {}; -} - -Parser.prototype.load = function(html, config, url) { - log.verbose('[Parser] Load html and inject config'); - $ = cheerio.load(html); - _config = config; -} - -Parser.prototype.getLinks = function() { - log.verbose('[Parser] Get links'); - - var links = []; - var selector = 'a[rel!=nofollow]'; - var blacklist = this._blacklist; - var siteUri = this._siteUri; - var 
cache = this._cache; - var cacheFile = this._cacheFile; - - if (blacklist.classes !== undefined) { - selector += ':not(' + blacklist.classes.join(',') + ')'; - } - - $(selector).each(function() { - var url = $(this).attr('href'); - - // Skip links that has no href - if (url === undefined || !url) { - var outerHTML = $('
').append($(this)).html(); - log.debug('[Parser] No link: ' + outerHTML); - return; - } - url = url.trim(); - log.silly('[Parser] Found url: ' + url); - - // Skip blacklisted paths - if (blacklist.paths !== undefined) { - for (var i = 0; i < blacklist.paths.length; i++) { - if (url.indexOf(blacklist.paths[i]) !== -1) { - log.debug('[Parser] Skip ignored path (' + blacklist.paths[i] + '): ' + url); - return; - } - } - } - - var uri = URI(url); - - // Prepend domain to relative path (Example: ) - if (uri.is('relative')) { - uri.protocol(siteUri.protocol()).hostname(siteUri.hostname()); - } - - // Skip external links - if (uri.hostname() !== siteUri.hostname()) { - log.debug('[Parser] Skip external link: ' + url); - return; - } - - // Skip media files - if (uri.suffix().match(/jpg|jpeg|png|gif|bmp|svg|pdf/i) !== null) { - log.debug('[Parser] Skip media file: ' + url); - return; - } - - // Remove anchor (Example: http://domain/page.html#anchor) - uri.hash(''); - - // Remove trailing slash (Example: http://domain/section/) -> important? 
- - url = uri.normalize().toString(); - log.silly('[Parser] Processed: ' + url); - - // Skip previously downloaded pages - var md5 = crypto.createHash('md5').update(url).digest('hex'); - if (cache[md5] === undefined) { - cache[md5] = true; - log.silly('[Parser] Send to queue'); - } else { - log.silly('[Parser] Already fetched'); - return; - } - - // Skip previously downloaded filenames - if (uri.filename()) { - var md5file = crypto.createHash('md5').update(uri.filename()).digest('hex'); - if (cacheFile[md5file] === undefined) { - cacheFile[md5file] = [ url ]; - } else { - cacheFile[md5file].push(url); - log.warn(cacheFile[md5file]); - return; - } - } - - links.push(url); - }); - - if (links.length) { - log.verbose('[Parser] ' + links.length + ' new links'); - return links; - } else { - log.verbose('[Parser] No new links'); - return false; - } - -} - -Parser.prototype.getData = function(obj) { - log.verbose('[Parser] Parse content of ' + obj.url); - if (!obj.html) { - log.error('[Parser] No html for: ' + obj.url); - return obj; - } - $ = cheerio.load(obj.html); - obj.web = {}; - pageParser(null, obj.config.targets, obj.web); - log.debug(obj.web); - return obj; -} - -function pageParser($container, targets, data, depth) { - depth = depth || 1; - var logPrefix = '[Parser] ' + Array(depth).join(" ") + ''; - log.silly(logPrefix + 'Depth: ' + depth); - for (var i = 0; i < targets.length; i++) { - var target = targets[i]; - - // Support single and multiple elements. Ex: '.foobar' and ['.foo', '.bar'] - var elements = Array.isArray(target.elem) ? target.elem : [ target.elem ]; - - var $elem; - if (target.ifelse) { - // Only use the first found element in array - for (var l = 0; l < elements.length; l++) { - $elem = $container !== null ? $(elements[l], $container) : $(elements[l]); - if ($elem.length) { break; } - } - } else { - $elem = $container !== null ? 
$(elements.join(','), $container) : $(elements.join(',')); - } - - var msg = target.type + ': ' + target.elem + (target.name ? ' (' + target.name + ')' : ''); - if ($elem.length === 0) { - if (target.miss) { - var $missing = $container !== null ? $(target.miss, $container) : $(target.miss); - if ($missing.length) { - continue; - } - } else if (target.optional) { - continue; - } - log.warn('[Parser] Couldn\'t find ' + msg); - continue; - } - - log.silly(logPrefix + 'Found ' + msg); - var key = target.name; - if (target.type == 'container') { - if (!key) { - pageParser($elem, target.children, data, depth + 1); - } else if ($elem.length > 1) { - log.silly(logPrefix + 'Name: ' + key); - $elem.each(function() { - if (data[key] === undefined) { - data[key] = []; - } - var obj = {}; - data[key].push(obj); - pageParser($(this), target.children, obj, depth + 1); - }); - } else { - data[key] = {}; - log.silly(logPrefix + 'Name: ' + key); - pageParser($elem, target.children, data[key], depth + 1); - } - } else if (target.type == 'data') { - // Get value(s) from the data attribute, a custom attribute or content of tag - var values = []; - $elem.each(function() { - if (target.data) { - // Ex:
- var data = Array.isArray(target.data) ? target.data : [ target.data ]; - for (var j = 0; j < data.length; j++) { - values.push($(this).data(data[j])); - } - } else if (target.attr) { - // Ex: , foo - values.push($(this).attr(target.attr)); - } else if (target.text) { - // Ex:
valueskip this
- values.push($(this).contents().filter(function() { return this.nodeType == 3; } ).text()); // 3 = TEXT_NODE - } else { - // Ex:

value 1

value 2

,
value
- var value = target.tags ? $(this).html() : $(this).text(); - values.push(value.trim()); - } - }); - - // Run functions defined in config on found values - if (target.func) { - var functions = Array.isArray(target.func) ? target.func : [ target.func ]; - for (var j = 0; j < functions.length; j++) { - var name = functions[j].name; - var args = functions[j].args; - for (var k = 0; k < values.length; k++) { - var value = values[k]; - values[k] = _functions[name](value, args); - log.silly(logPrefix + 'Run function: ' + name + (args ? ' (' + value + ', ' + JSON.stringify(args) + ')' : '')); - } - } - } - - // Store found and processed values in data structure - if (values.length > 1) { - // Support joining of values - if (target.glue) { - log.silly(logPrefix + 'Glue: ' + values.length + ' items joined with "' + target.glue + '"'); - values = values.join(target.glue); - } - data[key] = data[key] ? Array.concat(data[key], values) : values; // join values with same name - } else if (values.length) { - data[key] = data[key] ? values.concat(data[key]) : values.pop(); // join values with same name - } - } - } -} - -// Parse functions -var _functions = { - regexp: function(value, args) { - var matches = value.match(new RegExp(args[0])); - if (matches) { - return args[1] ? matches[args[1]] : matches; - } else { - return null; - } - }, - prepend: function(value, text) { - return text + value; - }, - append: function(value, text) { - return value + text; - }, - join: function(value, args) { - var str = ''; - for (var i = 0; i < args.length; i++) { - if (args[i].charAt(0) === '$') { - if (Array.isArray(value)) { - var index = parseInt(args[i].substr(1)); - str += value[index]; - } - else { - str += value; - } - } - else { - str += args[i]; - } - } - return str; - }, - replace: function(value, args) { - // Check if pattern is a regex or string - var pattern = args[2] && args[2] == 'regexp' ? 
new RegExp(args[0], 'g') : args[0]; - return value.replace(pattern, args[1]); - }, - toInt: function(value) { - return /^\d+$/.test(value) ? parseInt(value, 10) : false; - }, - parsePrice: function(price, args) { - // Example inputs: "kr 2.347,95", "969 NOK", "625 kr.", "449.0" (number) - price = price.toString().replace(/[^\d,.]/g, ''); // strip everything except numbers, "," and "." - var match = price.match(/^([\d,.]+)[.,](\d{1,2})$/); // split price on decimals if they exist - if (match) { - price = match[1].replace(/[,.]/g, ''); // strip "," and "." from the part before the decimals - price = Math.round(price + '.' + match[2]); // add decimals and round - } else if (price) { - price = parseInt(price.replace(/[,.]/g, ''), 10); // strip "," and "." and convert to int - } else { - price = null; - } - if (args && args.addVat) { - price = Math.round(price * 1.25); // Add MVA/VAT and round - } - return price; - }, - htmlEntitiesDecode: function(value) { - return he.decode(value); - } -} - - - -module.exports = Parser; diff --git a/queue.js b/queue.js deleted file mode 100644 index 1dc0cbc..0000000 --- a/queue.js +++ /dev/null @@ -1,79 +0,0 @@ -var log = require('./log'); - -var Queue = function(options) { - this._stack = { - get: [], // items at current level - add: [] // items for next level - } - this.active = true; - - // Handle options and defaults - options = options || {}; - this._maxLevel = options.maxLevel || false; - this._maxItems = options.maxItems || false - - // Internal - this._currentLevel = 0; - this._currentItem = 0; -} - -Queue.prototype.add = function(items) { - log.verbose('[Queue] Received ' + items.length + ' items'); - if (!this._maxLevel || this._currentLevel < this._maxLevel) { - - // Support items not in array - if (!Array.isArray(items)) { - items = [ items ]; - } - - // Add items to the stack - var n = items.length; - var added = 0; - for (var i = 0; i < n; i++) { - if (!this._maxItems || this._currentItem < this._maxItems) { - 
this._stack.add.push(items[i]); - this._currentItem++; - added++; - log.debug('[Queue] Added item: ' + items[i]); - } else { - this.active = false; - log.debug('[Queue] Reached max items limit of ' + this._maxItems); - break; - } - } - log.verbose('[Queue] Added ' + added + ' items'); - } else { - log.verbose('[Queue] No items added'); - } -} - -Queue.prototype.get = function() { - log.verbose('[Queue] Get item'); - if (this._stack.get.length) { - return this._stack.get.pop(); - } else { - if (!this._maxLevel || this._currentLevel < this._maxLevel) { - // Set next level - this._currentLevel++; - log.verbose('[Queue] Starting level ' + this._currentLevel); - - // Switch stack - this._stack.get = this._stack.add; - this._stack.add = []; - - if (this._stack.get.length) { - return this._stack.get.pop(); - } else { - this.active = false; - log.verbose('[Queue] No more items in queue'); - return false; - } - } else { - this.active = false; - log.verbose('[Queue] Reached max level limit of ' + this._maxLevel); - return false; - } - } -} - -module.exports = Queue; diff --git a/solr.js b/solr.js deleted file mode 100644 index 9732815..0000000 --- a/solr.js +++ /dev/null @@ -1,33 +0,0 @@ -var solr = require('solr-client') -var log = require('./log'); - -var Solr = function(host, port, index) { - if (index === undefined) { - log.warn('[Solr] No index provided'); - return; - } - host = host || '192.168.1.104'; - port = port || '8983'; - this.client = solr.createClient({ - host: host, - port: port, - core: index - }); -} - -Solr.prototype = Object.create(require('events').EventEmitter.prototype); - -Solr.prototype.add = function(doc) { - var self = this; - this.client.add(doc, function(error, response) { - if (error) { - log.error('[Solr] ' + error); - } else { - self.client.commit(); - log.verbose('[Solr] Document added'); - self.emit('added', response); - } - }); -} - -module.exports = Solr; \ No newline at end of file diff --git a/src/config.js b/src/config.js new file mode 
100644 index 0000000..7faf775 --- /dev/null +++ b/src/config.js @@ -0,0 +1,64 @@ +import fs from 'fs'; +import URI from 'urijs'; +import log from './log'; + +export default { setPath, get }; + +let _path = __dirname + '/../config'; + +function setPath(path) { + _path = path; +} + +function get(arg) { + if (Number.isInteger(arg)) { + return getById(arg); + } else if (arg.indexOf('http') !== -1) { + return getByUrl(arg); + } else if (/^[^/]+\.json$/.test(arg)) { + return parse(_path + '/' + arg); + } else { + return parse(arg); + } +} + +function getById(id) { + let files = getFiles(_path); + for (let i = 0; i < files.length; i++) { + if (files[i].match(/-(\d+)/).pop() == id) { + return parse(files[i]); + } + } + return false; +} + +function getByUrl(url) { + let hostname = new URI(url).hostname(); + let files = getFiles(_path); + for (let i = 0; i < files.length; i++) { + let config = parse(files[i]); + if (hostname == new URI(config.url).hostname()) { + // log.verbose('[Config] Get config for ' + obj.name); + return config; + } + } + // log.error('[Config] Missing config for ' + url); + return false; +} + +// Recursivly find all files +function getFiles(path) { + var files = []; + fs.readdirSync(path).forEach( (file) => { + if (fs.statSync(path + '/' + file).isDirectory()) { + files = files.concat(getFiles(path + '/' + file)); + } else { + files.push(path + '/' + file); + } + }); + return files; +} + +function parse(file) { + return JSON.parse(fs.readFileSync(file).toString()); +} diff --git a/src/download.js b/src/download.js new file mode 100644 index 0000000..fcc3e01 --- /dev/null +++ b/src/download.js @@ -0,0 +1,31 @@ +import request from "request"; +import log from './log'; + +export default function(url, timeout) { + timeout = timeout ? 
timeout * 1000 : 60000; + const t0 = process.hrtime(); + const options = { + url: url, + headers: { + 'User-Agent': USER_AGENT + }, + timeout: timeout + } + return new Promise(function (fulfill, reject) { + request(options, function (error, response, html) { + if (error !== null) { + reject(error); + } else if (response.statusCode !== 200) { + reject('Error! Response code: ' + response.statusCode); + } else if (html){ + var diff = process.hrtime(t0); + fulfill({ + html: html, + time: (diff[0] + diff[1] * 1e-9).toFixed(2) + }); + } else { + reject('This should not happen'); + } + }); + }); +} diff --git a/src/index.js b/src/index.js new file mode 100644 index 0000000..821f3a6 --- /dev/null +++ b/src/index.js @@ -0,0 +1,9 @@ +import config from './config'; +import download from "./download" +import Parser from "./parser" +import log from "./log" + +global.VERSION = '1.0.0-alpha'; +global.USER_AGENT = 'Edderkopp/' + VERSION; + +export { config, download, Parser, log }; diff --git a/log.js b/src/log.js similarity index 69% rename from log.js rename to src/log.js index f375ea6..93196fc 100644 --- a/log.js +++ b/src/log.js @@ -1,4 +1,14 @@ -var winston = require('winston'); +import winston from 'winston'; +/** + * Config levels: + * silly: 0, + * debug: 1, + * verbose: 2, + * info: 3, + * warn: 4, + * error: 5 + */ + winston.emitErrs = true; var log = new winston.Logger({ @@ -22,13 +32,8 @@ var log = new winston.Logger({ exitOnError: false }); -module.exports = log; +log.setLevel = function(level) { + this.transports.console.level = level; +}; -//npmConfig.levels = { -// silly: 0, -// debug: 1, -// verbose: 2, -// info: 3, -// warn: 4, -// error: 5 -//}; +export default log; diff --git a/src/parser.js b/src/parser.js new file mode 100644 index 0000000..e69d825 --- /dev/null +++ b/src/parser.js @@ -0,0 +1,135 @@ +import URI from 'urijs'; +import cheerio from "cheerio"; +import log from './log'; +import * as tasks from './parser.tasks'; + +export default class Parser { + + 
constructor(html) { + this.$ = cheerio.load(html); + } + + getData(rules) { + return this._recParse(rules); + } + + // Recursively parse DOM + _recParse(rules, data, $container) { + let $ = this.$; + data = data || {}; + for (let i = 0; i < rules.length; i++) { + const rule = rules[i]; + if (rule.name) { + const $elem = rule.elem ? $(rule.elem, $container) : $container; + if (rule.data == 'array') { + data[rule.name] = []; + $elem.each((i, e) => { + let obj = {}; + data[rule.name].push(obj); + this._recParse(rule.kids, obj, $(e)); + }); + } else if (rule.data == 'object') { + data[rule.name] = {}; + this._recParse(rule.kids, data[rule.name], $elem); + } else { + if ($elem.length > 0) { + const values = this._getContent($elem, rule); + // Join values with same name + data[rule.name] = data[rule.name] ? [].concat(data[rule.name], values) : values; + } else if (!rule.null){ + log.warn('Element not found: ' + rule.elem); + } + } + } else if (rule.elem) { + this._recParse(rule.kids, data, $(rule.elem, $container)); + } + } + return data; + } + + // Get values + _getContent($elem, rule) { + let $ = this.$; + let values = []; + const dataType = Array.isArray(rule.data) ? rule.data[0] : rule.data; + $elem.each(function() { + switch (dataType) { + case 'html': + // Get all content including tags + // Ex:

paragraph 1

paragraph 2

paragraph 3

+ values.push($(this).html().trim()); + break; + case 'txtn': + // Get only text nodes + // Ex: skip this get this skip this + values.push($(this).contents().filter(function() { + return this.nodeType == 3; // 3 = TEXT_NODE + }).text().trim()); + break; + case 'attr': + // Get content from attribute + // Ex: , foo + for (let i = 1; i < rule.data.length; i++) { + values.push($(this).attr(rule.data[i])); + } + break; + case 'data': + // Get content from data + // Ex:
+ for (let i = 1; i < rule.data.length; i++) { + values.push($(this).data(rule.data[i])); + } + break; + case 'text': + default: + // Get only text (strip away tags) + values.push($(this).text().trim()); + } + }); + + // Run tasks on values + if (rule.task) { + let task; + if (typeof rule.task == 'string') { + // "task": "foobar" + task = [ [ rule.task ] ]; + } else if (!Array.isArray(rule.task[0])) { + // "task": [ "foobar", "arg1", "arg2" ] + task = [ rule.task ]; + } else { + // "task": [ + // [ "foobar1", "arg1", "arg2" ], + // [ "foobar2", "arg1", "arg2" ] + // ] + task = rule.task; + } + for (let i = 0; i < task.length; i++) { + for (let j = 0; j < values.length; j++) { + let name = task[i][0]; + let args = task[i].slice(1); + if (tasks[name]) { + values[j] = tasks[name](args, values[j]); + } else { + log.warn('task not exist: ' + name); + } + } + } + } + + if (values.length == 1) { + values = values.pop(); + } + + return values; + } + + // Support custom tasks + static injectTasks(customTasks) { + for (var prop in customTasks) { + if (tasks[prop]) { + log.warn('Overriding task: ' + prop); + } + tasks[prop] = customTasks[prop]; + } + } +} diff --git a/src/parser.tasks.js b/src/parser.tasks.js new file mode 100644 index 0000000..289789b --- /dev/null +++ b/src/parser.tasks.js @@ -0,0 +1,63 @@ +export { + _match as match, + _prepend as prepend, + _append as append, + _join as join, + _replace as replace, + _parseInt as parseInt +}; + +// "task": [ "match", "\\/(\\d+)\\.", 1 ] +function _match(args, value) { + var matches = value.match(new RegExp(args[0])); + if (matches) { + return args[1] ? 
matches[args[1]] : matches; + } else { + return null; + } +} + +// "task": [ "prepend", "http://foo.bar/" ] +function _prepend(args, value) { + return args[0] + value; +} + +// "task": [ "append", "&foo=bar" ] +function _append(args, value) { + return value + args[0]; +} + +// "task": [ "join", "http://foo.bar/", "$1" ] +// "task": [ "join", "$1", "$3", "(foobar)", "$2" ] +function _join(args, value) { + var str = ''; + for (var i = 0; i < args.length; i++) { + if (args[i].charAt(0) === '$') { + str += value[args[i].substr(1)]; + } else { + str += args[i]; + } + } + return str; +} + +// "task": [ "replace", "foo", "bar" ] +// "task": [ "replace", "[\\r\\n\\t\\s]+", "", "regexp" ] +function _replace(args, value) { + if (typeof args[0] == 'string' && typeof args[1] == 'string') { + args[0] = [ args[0] ]; + args[1] = [ args[1] ]; + } + var pattern; + for (var i = 0; i < args[0].length; i++) { + pattern = args[2] && args[2] == 'regexp' ? new RegExp(args[0][i], 'g') : args[0][i]; + value = value.replace(pattern, args[1][i]); + } + return value; +} + +// "task": [ "parseInt" ] +function _parseInt(args, value) { + value = value ? value.replace(/[^\d]/g, '') : null; + return value ? parseInt(value, 10) : null; +}