ageitgey · danielgranat · Feb 24, 2015 · Feb 25, 2015 · Feb 25, 2015 · Feb 25, 2015
diff --git a/data/stopwords/stopwords-he.txt b/data/stopwords/stopwords-he.txt
diff --git a/fixtures/test_wikipedia1.json b/fixtures/test_wikipedia1.json
@@ -0,0 +1,11 @@
+{
+    "url": "http://en.wikipedia.org/wiki/Now_and_Then,_Here_and_There", 
+    "expected": {
+        "domain": "en.wikipedia.org", 
+        "title": "Now and Then, Here and There",
+        "cleaned_text": "SAN FRANCISCO (AP) \u2014 Steve Jobs, the mind behind the iPhone", 
+        "meta_favicon": "//bits.wikimedia.org/favicon/wikipedia.ico", 
+        "meta_lang": "en",
+        "image": "//upload.wikimedia.org/wikipedia/en/thumb/1/10/Now_and_Then_Here_and_There.png/230px-Now_and_Then_Here_and_There.png"
+    }
+}
diff --git a/lib/domainExtractor.js b/lib/domainExtractor.js
diff --git a/lib/domain_extractors/wikipedia.org.js b/lib/domain_extractors/wikipedia.org.js
diff --git a/lib/extractor.js b/lib/extractor.js
diff --git a/lib/unfluff.js b/lib/unfluff.js
diff --git a/src/domainExtractor.coffee b/src/domainExtractor.coffee
@@ -0,0 +1,60 @@
+path = require('path')
+fs = require('fs')
+_ = require('lodash')
+{XRegExp} = require('xregexp')
+
+# Extractor modules that have already been loaded, keyed by domain.
+cache = {}
+# Extension of this file without the dot, so that extractor files are looked
+# up with the same extension as the module that is currently running.
+extension = __filename.substr(__filename.lastIndexOf(".")+1)
+
+# Matches an optional scheme followed by "//" and captures the host name.
+# Anchored at the start so that a relative path containing "//" is not
+# mistaken for a URL.
+domainRegex = XRegExp('^\\s*[a-zA-Z]*:*//(?<domain>[a-zA-Z0-9\\-\\.]+)')
+
+# Path of the extractor module for `domain` inside "domain_extractors".
+# The file is not guaranteed to exist.
+getFilePath = (domain) ->
+  path.join(__dirname, "domain_extractors", "#{domain}.#{extension}")
+
+# Look up the domain specific extractor module for `url`.
+#
+# Candidates are tried from the most specific host name to the least specific
+# one and the first module found wins, so "en.wikipedia.org" takes precedence
+# over "wikipedia.org" when both files exist.
+#
+# Returns the required module, or null when `url` has no recognisable host
+# or no extractor file exists for it.
+module.exports = domainExtractors = (url) ->
+  for domain in extractDomains(url)
+    return cache[domain] if _.has(cache, domain)
+    filePath = getFilePath(domain)
+    if fs.existsSync(filePath)
+      return cache[domain] = require(filePath)
+  return null
+
+# List the host name of `url` followed by each parent domain that still has
+# at least two labels, e.g. "en.wikipedia.org" gives
+# ["en.wikipedia.org", "wikipedia.org"].
+#
+# A two-label public suffix such as "co.il" can appear as the last candidate;
+# it only matters if an extractor file with that name is added.
+#
+# Returns [] when no host can be found. The raw input is never used as a
+# candidate because candidates end up in a file path passed to require().
+extractDomains = (url) ->
+  match = XRegExp.exec(String(url ? ''), domainRegex)
+  return [] unless match
+
+  # Host names are case-insensitive; extractor files use lower case names.
+  domain = match[1].toLowerCase()
+  labels = domain.split('.')
+  domains = [domain]
+  # `by 1` keeps the loop ascending (and empty) when there are fewer than
+  # three labels; without it the range would count downwards.
+  for index in [0...labels.length - 2] by 1
+    domains.push labels.slice(index + 1).join('.')
+
+  return domains
diff --git a/src/domain_extractors/wikipedia.org.coffee b/src/domain_extractors/wikipedia.org.coffee
@@ -0,0 +1,23 @@
+_ = require("lodash")
+
+module.exports =
+  # URL of the first image inside the article's ".infobox" element.
+  # Returns undefined when there is no such image or it has no src.
+  image: (doc) ->
+    src = doc(".infobox img").first().attr('src')
+    return src if src
+
+  # Article title taken from the <title> element, cut at the first delimiter
+  # (tried in the listed order) that occurs in the text.
+  # Returns null when the document has no <title> element.
+  title: (doc) ->
+    titleElement = doc("title")
+    # doc(...) returns an object, which is always truthy, so test its
+    # length, and do so before reading the text.
+    return null unless titleElement.length > 0
+
+    titleText = titleElement.text()
+    delimiter = _.find ["|", " - ", "»", ":"], (c) -> titleText.indexOf(c) >= 0
+    titleText = titleText.split(delimiter)[0] if delimiter
+
+    titleText.replace(/�/g, "").trim()
diff --git a/src/extractor.coffee b/src/extractor.coffee
@@ -1,10 +1,18 @@
 _ = require("lodash")
 stopwords = require("./stopwords")
 formatter = require("./formatter")
+domainsExtractor = require("./domainExtractor")
 
 module.exports =
   # Grab the title of an html doc (excluding junk)
-  title: (doc) ->
+  title: (doc,url) ->
+    if url
+      domainExtractor = domainsExtractor(url)
+      if domainExtractor && domainExtractor.title != undefined
+        title = domainExtractor.title(doc)
+        return title unless ! title
+
+
     titleElement = doc("meta[property='og:title']")
     titleText = titleElement.attr("content") if titleElement
 
@@ -31,8 +39,14 @@ module.exports =
       ""
 
   # Grab an image for the page
-  image: (doc) ->
-    images = doc("meta[property='og:image'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0']")
+  image: (doc,url) ->
+    if url
+      domainExtractor = domainsExtractor(url)
+      if domainExtractor && domainExtractor.image != undefined
+        image = domainExtractor.image(doc)
+        return image unless ! image
+
+    images = doc("meta[property='og:image'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0'], .infobox img[src]")
 
     if images.length > 0 && images.first().attr('content')
       return images.first().attr('content')

diff --git a/src/unfluff.coffee b/src/unfluff.coffee
@@ -5,16 +5,17 @@ cleaner = require("./cleaner")
 module.exports = unfluff = (html, language) ->
   doc = cheerio.load(html)
   lng = language || extractor.lang(doc)
+  url = extractor.canonicalLink(doc) || extractor.favicon(doc)
 
   pageData =
-    title: extractor.title(doc)
+    title: extractor.title(doc,url)
     favicon: extractor.favicon(doc)
     description: extractor.description(doc)
     keywords: extractor.keywords(doc)
     lang: lng
     canonicalLink: extractor.canonicalLink(doc)
     tags: extractor.tags(doc)
-    image: extractor.image(doc)
+    image: extractor.image(doc,url)
 
   # Step 1: Clean the doc
   cleaner(doc)
@@ -32,7 +33,8 @@ module.exports = unfluff = (html, language) ->
 unfluff.lazy = (html, language) ->
   title: () ->
     doc = getParsedDoc.call(this, html)
-    @title_ ?= extractor.title(doc)
+    url = extractor.canonicalLink(doc) || extractor.favicon(doc)
+    @title_ ?= extractor.title(doc,url)
 
   favicon: () ->
     doc = getParsedDoc.call(this, html)
@@ -60,7 +62,8 @@ unfluff.lazy = (html, language) ->
 
   image: () ->
     doc = getParsedDoc.call(this, html)
-    @image_ ?= extractor.image(doc)
+    url = extractor.canonicalLink(doc) || extractor.favicon(doc)
+    @image_ ?= extractor.image(doc,url)
 
   videos: () ->
     return @videos_ if @videos_?

diff --git a/test/domainExtractor.coffee b/test/domainExtractor.coffee
@@ -0,0 +1,17 @@
+suite 'DomainExtractor', ->
+  domainExtractor = require("../src/domainExtractor")
+
+  test 'exists', ->
+    ok domainExtractor
+
+  test 'en.wikipedia.org', ->
+    ok domainExtractor('http://en.wikipedia.org/wiki/Thomas_Edison')
+
+  test 'he.wikipedia.org', ->
+    ok domainExtractor('http://he.wikipedia.org/wiki/Thomas_Edison')
+
+  test 'something.he.wikipedia.org', ->
+    ok domainExtractor('http://something.he.wikipedia.org/wiki/Thomas_Edison')
+
+  test 'domain without an extractor', ->
+    ok !domainExtractor('http://example.com/some/page')
diff --git a/test/unfluff.coffee b/test/unfluff.coffee
@@ -105,6 +105,9 @@ suite 'Unfluff', ->
     checkFixture('polygon' , ['image'])
     checkFixture('theverge1' , ['image'])
 
+  test 'using domain extractor', ->
+    checkFixture('wikipedia1' , ['image','title'])
+
   test 'gets cleaned text - Polygon', ->
     checkFixture('polygon' , ['cleaned_text', 'title', 'link', 'description', 'lang', 'favicon'])