Skip to content

Commit

Permalink
Merge pull request #353 from spencermountain/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
spencermountain authored Apr 22, 2020
2 parents c37f07f + 7e97b36 commit be66179
Show file tree
Hide file tree
Showing 142 changed files with 13,691 additions and 900 deletions.
105 changes: 59 additions & 46 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
```js
const wtf = require('wtf_wikipedia')

wtf.fetch('Toronto Raptors').then(doc => {
wtf.fetch('Toronto Raptors').then((doc) => {
doc.sentences(0).text()
//'The Toronto Raptors are a Canadian professional basketball team ...'

Expand Down Expand Up @@ -107,18 +107,15 @@ the default json output is [really verbose](https://observablehq.com/@spencermou

```js
// get just the links:
doc.links().map(link => link.json())
doc.links().map((link) => link.json())
//[{ page: 'Theatrical superstitions', text: 'superstitions' }]

// just the images:
doc.images(0).json()
// { file: 'Image:Duveneck Whistling Boy.jpg', url: 'https://commons.wiki...' }

// json for a particular section:
doc
.sections('see also')
.links(0)
.json()
doc.sections('see also').links(0).json()
// { page: 'Slide Whistle' }
```

Expand All @@ -138,9 +135,9 @@ run it on the client-side:
<script src="https://unpkg.com/wtf_wikipedia"></script>
<script>
// follow a redirect:
wtf.fetch('On a Friday', function(err, doc) {
wtf.fetch('On a Friday', function (err, doc) {
let members = doc.infobox().get('current members')
members.links().map(l => l.page())
members.links().map((l) => l.page())
//['Thom Yorke', 'Jonny Greenwood', 'Colin Greenwood'...]
})
</script>
Expand All @@ -153,7 +150,7 @@ run it on the client-side:
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221837-0d142480-ffb8-11e9-9d30-90669f1b897c.png"/>
</div>

### full wikipedia dumps
## full wikipedia dumps

With this library, in conjunction with [dumpster-dive](https://github.com/spencermountain/dumpster-dive), you can parse the whole English Wikipedia in an afternoon.

Expand Down Expand Up @@ -182,29 +179,51 @@ npm install -g dumpster-dive
- [Fetching a list of pages](https://observablehq.com/@spencermountain/parsing-a-list-of-wikipedia-articles)
- [Parsing COVID outbreak table](https://observablehq.com/@spencermountain/parsing-wikipedias-coronavirus-outbreak-data?collection=@spencermountain/wtf_wikipedia)

### Plugins
<!-- spacer -->
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221824-09809d80-ffb8-11e9-9ef0-6ed3574b0ce8.png"/>
</div>

| | |
| ------------------------------ | --------------- |
| [html](./plugins/html) | output html |
| [markdown](./plugins/markdown) | output markdown |
| [latex](./plugins/latex) | output latex |
## Plugins

| | |
| ------------------------------ | -------------------------------------- |
| [i18n](./plugins/i18n) | improve multilingual template coverage |
| [classify](./plugins/classify) | is the article about a person? |
| [summary](./plugins/summary) | small description text |
these add all sorts of new functionality:

| | |
| ------------------------------ | ---------------------------------- |
| [category](./plugins/category) | parse all articles in a category |
| [image](./plugins/image) | additional methods for `.images()` |
```js
wtf.extend(require('wtf-plugin-classify'))
wtf.fetch('Toronto Raptors').then((doc) => doc.classify())
// 'Organization/SportsTeam'

wtf.extend(require('wtf-plugin-summary'))
wtf.fetch('Pulp Fiction').then((doc) => doc.summary())
// 'a 1994 American crime film'

wtf.extend(require('wtf-plugin-person'))
wtf.fetch('David Bowie').then((doc) => doc.birthDate())
// {year:1947, date:8, month:1}

wtf.extend(require('wtf-plugin-i18n'))
wtf.fetch('Ziggy Stardust', 'fr').then((doc) => {
doc.infobox().json()
//{ nom:{text:"Ziggy Stardust"}, oeuvre:{text:"The Rise and Fall of Ziggy Stardust"} }
})
```

| | |
| ----------------------------------------------------- | ----------------------------- |
| [wtf-mlb](https://github.com/spencermountain/wtf-mlb) | baseball team & season parser |
| [wtf-nhl](https://github.com/spencermountain/wtf-nhl) | hockey team & season parser |
| **Plugin** | |
| ---------------------------------------------------------- | --------------------------------------- |
| [classify](./plugins/classify) | person/place/thing |
| [summary](./plugins/summary) | short description text |
| [person](./plugins/person) | birth/death information |
| [category](./plugins/category) | parse all articles in a category |
| [i18n](./plugins/i18n) | improves multilingual template coverage |
| [wtf-mlb](https://github.com/spencermountain/wtf-mlb) | fetch baseball data |
| [wtf-nhl](https://github.com/spencermountain/wtf-nhl) | fetch hockey data |
| [nsfw](https://github.com/spencermountain/wtf-plugin-nsfw) | flag sexual/graphic/adult articles |
| [image](./plugins/image) | additional methods for `.images()` |
| [html](./plugins/html) | output html |
| [wikitext](./plugins/wikitext) | output wikitext |
| [markdown](./plugins/markdown) | output markdown |
| [latex](./plugins/latex) | output latex |

<div align="right">
<a href="https://observablehq.com/@spencermountain/wtf-wikipedia-plugins">plugin docs</a>
Expand Down Expand Up @@ -308,7 +327,7 @@ wtf(txt)
```javascript
let str = `Whistling is featured in a number of television shows, such as [[Lassie (1954 TV series)|''Lassie'']], and the title theme for ''[[The X-Files]]''.`
let doc = wtf(str)
doc.links().map(l => l.page())
doc.links().map((l) => l.page())
// [ 'Lassie (1954 TV series)', 'The X-Files' ]
```

Expand All @@ -328,12 +347,8 @@ var text = wtf(wiki).text()
a section is a heading _'==Like This=='_

```js
wtf(page)
.sections(1)
.children() //traverse nested sections
wtf(page)
.sections('see also')
.remove() //delete one
wtf(page).sections(1).children() //traverse nested sections
wtf(page).sections('see also').remove() //delete one
```

#### **doc.sentences()**
Expand All @@ -343,7 +358,6 @@ s = wtf(page).sentences(4)
s.links()
s.bolds()
s.italics()
s.dates() //structured date templates
```

#### **doc.categories()**
Expand Down Expand Up @@ -383,7 +397,7 @@ doc.sentences(0).text() // 'Tony Hawk est un skateboarder professionnel et un ac
let docs = wtf.fetch(['Whistling', 2983], { follow_redirects: false })

// article from german wikivoyage
wtf.fetch('Toronto', { lang: 'de', wiki: 'wikivoyage' }).then(doc => {
wtf.fetch('Toronto', { lang: 'de', wiki: 'wikivoyage' }).then((doc) => {
console.log(doc.sentences(0).text()) // 'Toronto ist die Hauptstadt der Provinz Ontario'
})
```
Expand Down Expand Up @@ -437,10 +451,10 @@ The wikipedia api is [pretty welcoming](https://www.mediawiki.org/wiki/API:Etiqu
```js
wtf
.fetch(['Royal Cinema', 'Aldous Huxley'], 'en', {
'Api-User-Agent': '[email protected]'
'Api-User-Agent': '[email protected]',
})
.then(docList => {
let links = docList.map(doc => doc.links())
.then((docList) => {
let links = docList.map((doc) => doc.links())
console.log(links)
})
```
Expand Down Expand Up @@ -515,7 +529,6 @@ wtf
- **.links()** -
- **.bolds()** -
- **.italics()** -
- **.dates()** -
- **.json()** -

### Image
Expand Down Expand Up @@ -569,10 +582,10 @@ wtf
you can add new methods to any class of the library, with `wtf.extend()`

```js
wtf.extend(models => {
wtf.extend((models) => {
// throw this method in there...
models.Doc.prototype.isPerson = function() {
return this.categories().find(cat => cat.match(/people/))
models.Doc.prototype.isPerson = function () {
return this.categories().find((cat) => cat.match(/people/))
}
})

Expand Down Expand Up @@ -621,15 +634,15 @@ It can usually be found by visiting `http://mywiki.com/api.php`
to fetch pages from a 3rd-party wiki:

```js
wtf.fetch('Kermit', { domain: 'muppet.fandom.com' }).then(doc => {
wtf.fetch('Kermit', { domain: 'muppet.fandom.com' }).then((doc) => {
console.log(doc.text())
})
```

Some wikis change the path of their API from `./api.php` to something else. If your API has a different path, you can set it like so:

```js
wtf.fetch('2016-06-04_-_J.Fernandes_@_FIL,_Lisbon', { domain: 'www.mixesdb.com', path: 'db/api.php' }).then(doc => {
wtf.fetch('2016-06-04_-_J.Fernandes_@_FIL,_Lisbon', { domain: 'www.mixesdb.com', path: 'db/api.php' }).then((doc) => {
console.log(doc.templates('player'))
})
```
Expand Down
76 changes: 51 additions & 25 deletions builds/wtf_wikipedia-client.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* wtf_wikipedia 8.1.2 MIT */
/* wtf_wikipedia 8.2.0 MIT */
(function (global, factory) {
typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
typeof define === 'function' && define.amd ? define(factory) :
Expand Down Expand Up @@ -3060,7 +3060,6 @@
text: true,
links: true,
formatting: true,
dates: true,
numbers: true
};

Expand Down Expand Up @@ -3092,10 +3091,6 @@
data.formatting = s.data.fmt;
}

if (options.dates && s.data.dates !== undefined) {
data.dates = s.data.dates;
}

return data;
};

Expand Down Expand Up @@ -3166,10 +3161,6 @@
dates: function dates(n) {
var arr = [];

if (this.data && this.data.dates) {
arr = this.data.dates || [];
}

if (typeof n === 'number') {
return arr[n];
}
Expand Down Expand Up @@ -3208,9 +3199,8 @@
var abbrev_reg = new RegExp("(^| |')(" + abbreviations.join('|') + ")[.!?] ?$", 'i');
var acronym_reg = new RegExp("[ |.|'|[][A-Z].? *?$", 'i');
var elipses_reg = new RegExp('\\.\\.\\.* +?$');
var hasWord = new RegExp('[a-zа-яぁ-ゟ][a-zа-яぁ-ゟ゠-ヿ]', 'iu'); // 3040-309F : hiragana
// 30A0-30FF : katakana
//turn a nested array into one array
var circa_reg = / c\.$/;
var hasWord = new RegExp('[a-zа-яぁ-ゟ][a-zа-яぁ-ゟ゠-ヿ]', 'iu'); //turn a nested array into one array

var flatten = function flatten(arr) {
var all = [];
Expand Down Expand Up @@ -3292,7 +3282,7 @@


var isSentence = function isSentence(hmm) {
if (hmm.match(abbrev_reg) || hmm.match(acronym_reg) || hmm.match(elipses_reg)) {
if (hmm.match(abbrev_reg) || hmm.match(acronym_reg) || hmm.match(elipses_reg) || hmm.match(circa_reg)) {
return false;
} //too short? - no consecutive letters

Expand Down Expand Up @@ -4842,7 +4832,10 @@
'pp', 'pp-move-indef', 'pp-semi-indef', 'pp-vandalism', //https://en.wikipedia.org/wiki/Template:R
'r', //out-of-scope still - https://en.wikipedia.org/wiki/Template:Tag
'#tag', //https://en.wikipedia.org/wiki/Template:Navboxes
'navboxes', 'reflist', 'ref-list', 'div col', // 'authority control',
// 'navboxes',
// 'reflist',
// 'ref-list',
'div col', // 'authority control',
//https://en.wikipedia.org/wiki/Template:Citation_needed
// 'better source',
// 'citation needed',
Expand Down Expand Up @@ -6330,17 +6323,51 @@
mw: 'mediawiki'
};
var parsers$1 = {
//https://en.wikipedia.org/wiki/Template:About
// https://en.wikipedia.org/wiki/Template:About
about: function about(tmpl, list) {
var obj = parse$3(tmpl); // obj.pos = r.title //not working

var obj = parse$3(tmpl);
list.push(obj);
return '';
},
//https://en.wikipedia.org/wiki/Template:Main
// https://en.wikipedia.org/wiki/Template:Main
main: function main(tmpl, list) {
var obj = parse$3(tmpl); // obj.pos = r.title //not working

var obj = parse$3(tmpl);
list.push(obj);
return '';
},
// https://en.wikipedia.org/wiki/Template:Main_list
'main list': function mainList(tmpl, list) {
var obj = parse$3(tmpl);
list.push(obj);
return '';
},
// https://en.wikipedia.org/wiki/Template:See
'see': function see(tmpl, list) {
var obj = parse$3(tmpl);
list.push(obj);
return '';
},
// https://en.wikipedia.org/wiki/Template:For
'for': function _for(tmpl, list) {
var obj = parse$3(tmpl);
list.push(obj);
return '';
},
// https://en.wikipedia.org/wiki/Template:Further
'further': function further(tmpl, list) {
var obj = parse$3(tmpl);
list.push(obj);
return '';
},
// same as "further" (but this name is still in use)
'further information': function furtherInformation(tmpl, list) {
var obj = parse$3(tmpl);
list.push(obj);
return '';
},
// https://en.wikipedia.org/wiki/Template:Listen
'listen': function listen(tmpl, list) {
var obj = parse$3(tmpl);
list.push(obj);
return '';
},
Expand Down Expand Up @@ -6455,8 +6482,7 @@
},
//https://en.wikipedia.org/wiki/Template:See_also
'see also': function seeAlso(tmpl, list) {
var data = parse$3(tmpl); // data.pos = r.title //not working

var data = parse$3(tmpl);
list.push(data);
return '';
},
Expand Down Expand Up @@ -8796,7 +8822,7 @@

var category = fetchCategory;

var _version = '8.1.2';
var _version = '8.2.0';

var wtf = function wtf(wiki, options) {
return _01Document(wiki, options);
Expand All @@ -8815,6 +8841,7 @@
Reference: Reference_1,
Table: Table_1,
Template: Template_1,
http: client,
wtf: wtf
};

Expand All @@ -8841,4 +8868,3 @@
return src;

})));
//# sourceMappingURL=wtf_wikipedia-client.js.map
2 changes: 1 addition & 1 deletion builds/wtf_wikipedia-client.js.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion builds/wtf_wikipedia-client.min.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion builds/wtf_wikipedia-client.mjs

Large diffs are not rendered by default.

Loading

0 comments on commit be66179

Please sign in to comment.