node-article-extractor: support Myanmar stop words #98

Open · wants to merge 3 commits into base: master
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,6 @@
+### 1.1.2
+* Add support for Chinese.
+
 ### 1.0.0
 * Add support for extracting out `softTitle`, `date`, `copyright`, `author`, `publisher` thanks to @philgooch. See [#49](https://github.com/ageitgey/node-unfluff/pull/49).
55 changes: 23 additions & 32 deletions README.md
@@ -1,14 +1,14 @@
-# unfluff
+# node-article-extractor
 
-An automatic web page content extractor for Node.js!
+An automatic web page content extractor for Node.js! Based on https://github.com/ageitgey/node-unfluff, but with Chinese support.
 
-[![Build Status](https://travis-ci.org/ageitgey/node-unfluff.svg?branch=master)](https://travis-ci.org/ageitgey/node-unfluff)
+[![Build Status](https://api.travis-ci.org/ahkimkoo/node-article-extractor.svg?branch=master)](https://travis-ci.org/ahkimkoo/node-article-extractor)
 
 Automatically grab the main
 text out of a webpage like this:
 
 ```
-extractor = require('unfluff');
+extractor = require('node-article-extractor');
 data = extractor(my_html_data);
 console.log(data.text);
 ```
@@ -38,21 +38,21 @@ check out those libraries!
 
 ## Install
 
-To install the command-line `unfluff` utility:
+To install the command-line `node-article-extractor` utility:
 
-    npm install -g unfluff
+    npm install -g node-article-extractor
 
-To install the `unfluff` module for use in your Node.js project:
+To install the `node-article-extractor` module for use in your Node.js project:
 
-    npm install --save unfluff
+    npm install --save node-article-extractor
 
 ## Usage
 
-You can use `unfluff` from node or right on the command line!
+You can use `node-article-extractor` from node or right on the command line!
 
 ### Extracted data elements
 
-This is what `unfluff` will try to grab from a web page:
+This is what `node-article-extractor` will try to grab from a web page:
 - `title` - The document's title (from the <title> tag)
 - `softTitle` - A version of `title` with less truncation
 - `date` - The document's publication date
@@ -73,33 +73,33 @@ This is returned as a simple json object.
 
 ### Command line interface
 
-You can pass a webpage to unfluff and it will try to parse out the interesting
+You can pass a webpage to node-article-extractor and it will try to parse out the interesting
 bits.
 
 You can either pass in a file name:
 
 ```
-unfluff my_file.html
+node-article-extractor my_file.html
 ```
 
 Or you can pipe it in:
 
 ```
-curl -s "http://somesite.com/page" | unfluff
+curl -s "http://somesite.com/page" | node-article-extractor
 ```
 
 You can easily chain this together with other unix commands to do cool stuff.
 For example, you can download a web page, parse it and then use
 [jq](http://stedolan.github.io/jq/) to print just the body text.
 
 ```
-curl -s "http://www.polygon.com/2014/6/26/5842180/shovel-knight-review-pc-3ds-wii-u" | unfluff | jq -r .text
+curl -s "http://www.polygon.com/2014/6/26/5842180/shovel-knight-review-pc-3ds-wii-u" | node-article-extractor | jq -r .text
 ```
 
 And here's how to find the top 10 most common words in an article:
 
 ```
-curl -s "http://www.polygon.com/2014/6/26/5842180/shovel-knight-review-pc-3ds-wii-u" | unfluff | tr -c '[:alnum:]' '[\n*]' | sort | uniq -c | sort -nr | head -10
+curl -s "http://www.polygon.com/2014/6/26/5842180/shovel-knight-review-pc-3ds-wii-u" | node-article-extractor | tr -c '[:alnum:]' '[\n*]' | sort | uniq -c | sort -nr | head -10
 ```
 
 ### Module Interface
@@ -116,15 +116,15 @@ The extraction algorithm depends heavily on the language, so it probably won't work
 if you have the language set incorrectly.
 
 ```javascript
-extractor = require('unfluff');
+extractor = require('node-article-extractor');
 
 data = extractor(my_html_data);
 ```
 
 Or supply the language code yourself:
 
 ```javascript
-extractor = require('unfluff');
+extractor = require('node-article-extractor');
 
 data = extractor(my_html_data, 'en');
 ```
@@ -169,7 +169,7 @@ are replaced by functions and evaluation is only done when you call those
 functions.
 
 ```javascript
-extractor = require('unfluff');
+extractor = require('node-article-extractor');
 
 data = extractor.lazy(my_html_data, 'en');
@@ -196,24 +196,15 @@ and looking them up multiple times should be as fast as possible.
 
 ### Demo
 
-The easiest way to try out `unfluff` is to just install it:
+The easiest way to try out `node-article-extractor` is to just install it:
 
 ```
-$ npm install -g unfluff
-$ curl -s "http://www.cnn.com/2014/07/07/world/americas/mexico-earthquake/index.html" | unfluff
+$ npm install -g node-article-extractor
+$ curl -s "http://www.cnn.com/2014/07/07/world/americas/mexico-earthquake/index.html" | node-article-extractor
```
 
 But if you can't be bothered, you can check out
 [fetch text](http://fetchtext.herokuapp.com/). It's a site by
-[Andy Jiang](https://twitter.com/andyjiang) that uses `unfluff`. You send an
+[Andy Jiang](https://twitter.com/andyjiang) that uses `node-article-extractor`. You send an
 email with a url and it emails back with the cleaned content of that url. It
-should give you a good idea of how `unfluff` handles different urls.
-
-### What is broken
-
-- Parsing web pages in languages other than English is poorly tested and probably
-  is buggy right now.
-- This definitely won't work yet for languages like Chinese / Arabic / Korean /
-  etc that need smarter word tokenization.
-- This has only been tested on a limited set of web pages. There are probably lots
-  of lurking bugs with web pages that haven't been tested yet.
+should give you a good idea of how `node-article-extractor` handles different urls.
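For a concrete picture of the renamed module, here is a minimal usage sketch; the HTML snippet and the `'zh'` language code are illustrative inputs chosen for this example, not taken from the diff.

```javascript
// Illustrative sketch only - sample inputs, not part of this PR.
var extractor = require('node-article-extractor');

// A tiny Chinese page (assumed sample input).
var html = '<html><head><title>示例文章</title></head>' +
           '<body><p>这是一段用于演示正文抽取的内容。</p></body></html>';

// Eager extraction; pass 'zh' explicitly rather than relying on detection.
var data = extractor(html, 'zh');
console.log(data.title); // expected: 示例文章
console.log(data.text);  // the extracted body text

// Lazy extraction: fields become functions, evaluated only when called.
var lazy = extractor.lazy(html, 'zh');
console.log(lazy.title()); // parses only what the title lookup needs
```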
File renamed without changes.
2 changes: 1 addition & 1 deletion lib/formatter.js

Some generated files are not rendered by default.

13 changes: 8 additions & 5 deletions lib/stopwords.js
@@ -1,4 +1,4 @@
-// Generated by CoffeeScript 2.0.0-beta7
+var nodejieba = require("nodejieba");
 void function () {
   var _, cache, candiateWords, fs, getFilePath, path, removePunctuation, stopwords;
   path = require('path');
@@ -11,7 +11,8 @@ void function () {
   module.exports = stopwords = function (content, language) {
     var count, filePath, overlappingStopwords, stopWords, strippedInput, words;
     if (null == language)
-      language = 'en';
+      //language = 'en';
+      language = 'zh';
     filePath = getFilePath(language);
     if (!fs.existsSync(filePath)) {
       console.error("WARNING: No stopwords file found for '" + language + "' - defaulting to English!");
@@ -20,7 +21,7 @@ void function () {
     if (cache.hasOwnProperty(language)) {
       stopWords = cache[language];
     } else {
-      stopWords = fs.readFileSync(filePath).toString().split('\n').filter(function (s) {
+      stopWords = fs.readFileSync(filePath).toString().split(/[\n\r]+/).filter(function (s) {
         return s.length > 0;
       });
       cache[language] = stopWords;
@@ -41,9 +42,11 @@ void function () {
     };
   };
   removePunctuation = function (content) {
-    return content.replace(/[\|\@\<\>\[\]\"\'\.,-\/#\?!$%\^&\*\+;:{}=\-_`~()]/g, '');
+    return content.replace(/[\|\@\<\>\[\]\"\'\.,-\/#\?!$%\^&\*\+;:{}=\-_`~()。,!`、~;:()-/×?]/g, '');
   };
   candiateWords = function (strippedInput) {
-    return strippedInput.split(' ');
+    //return strippedInput.split(' ');
+    // return strippedInput.split('');//Chinese supported
+    return nodejieba.cut(strippedInput);//Chinese smart split
   };
 }.call(this);
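To make the tokenization change concrete, here is a small, hedged demonstration of the difference between the old whitespace split and the new `nodejieba.cut` call; the sample sentence below is an assumption, not taken from the PR.

```javascript
// Illustrative only - shows why the whitespace split was replaced.
var nodejieba = require("nodejieba");

// Sample sentence with punctuation already removed (as removePunctuation would do).
var stripped = "今天天气很好我们一起去公园散步";

// Old behaviour: Chinese text has no spaces, so the split yields one giant
// "word" and no stopword can ever match.
console.log(stripped.split(' ')); // [ '今天天气很好我们一起去公园散步' ]

// New behaviour: dictionary-based segmentation yields word-level tokens that
// can be compared against the stopword list for 'zh'.
console.log(nodejieba.cut(stripped)); // e.g. [ '今天', '天气', '很', '好', ... ]
```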
4 changes: 2 additions & 2 deletions lib/unfluff.js

Some generated files are not rendered by default.

30 changes: 15 additions & 15 deletions package.json
@@ -1,8 +1,8 @@
 {
-  "name": "unfluff",
-  "version": "1.1.0",
+  "name": "node-article-extractor",
+  "version": "1.1.2",
   "description": "A web page content extractor",
-  "homepage": "https://github.com/ageitgey/node-unfluff",
+  "homepage": "https://github.com/ahkimkoo/node-article-extractor",
   "keywords": [
     "content extraction",
     "html",
@@ -12,14 +12,14 @@
     "body text"
   ],
   "author": {
-    "name": "Adam Geitgey",
-    "email": "ageitgey@gmail.com"
+    "name": "Cherokee Liu",
+    "email": "successage@gmail.com"
   },
   "repository": {
     "type": "git",
-    "url": "git://github.com/ageitgey/node-unfluff"
+    "url": "git://github.com/ahkimkoo/node-article-extractor"
   },
-  "bugs": "https://github.com/ageitgey/node-unfluff/issues",
+  "bugs": "https://github.com/ahkimkoo/node-article-extractor/issues",
   "engines": {
     "node": "0.8.x || 0.9.x || 0.10.x"
   },
@@ -30,26 +30,26 @@
     "test": "test"
   },
   "dependencies": {
-    "cheerio": "~0.17.0",
+    "cheerio": "~0.22.0",
+    "lodash": "~4.17.4",
+    "nodejieba": "^2.2.5",
     "optimist": "~0.6.1",
-    "lodash": "~2.4.1",
-    "xregexp": "~2.0.0"
+    "xregexp": "~3.2.0"
   },
   "devDependencies": {
     "coffee-script-redux": "2.0.0-beta7",
     "commonjs-everywhere": "0.9.x",
-    "mocha": "~1.12.1",
+    "mocha": "~3.4.1",
     "scopedfs": "~0.1.0",
-    "semver": "~2.1.0",
-    "deep-equal": "~0.2.1"
+    "semver": "~5.3.0",
+    "deep-equal": "~1.0.1"
   },
   "scripts": {
     "test": "make test"
   },
   "licenses": [
     {
       "type": "Apache",
-      "url": "https://github.com/ageitgey/node-unfluff/blob/master/LICENSE"
+      "url": "https://github.com/ahkimkoo/node-article-extractor/blob/master/LICENSE"
     }
   ]
 }