diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..02e7956 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) Ed Summers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 8963d94..f0eb76a 100644 --- a/README.md +++ b/README.md @@ -3,24 +3,124 @@ microdata [![Build Status](https://secure.travis-ci.org/edsu/microdata.png)](http://travis-ci.org/edsu/microdata) -microdata.py is a small utility library for extracting -[HTML5 Microdata](http://dev.w3.org/html5/md/) from -HTML. It depends on -[html5lib](http://code.google.com/p/html5lib/) -to do the heavy lifting of building the DOM. -For more about HTML5 Microdata check out Mark Pilgrim's -[chapter](http://diveintohtml5.org/extensibility.html) on on it in -[Dive Into HTML5](http://diveintohtml5.org/). +microdata.py is a small utility library for extracting [HTML5 +Microdata](http://dev.w3.org/html5/md/) from HTML. 
It depends on +[html5lib](http://code.google.com/p/html5lib/) to do the heavy lifting of +building the DOM. For more about HTML5 Microdata check out Mark Pilgrim's +[chapter](http://diveintohtml5.org/extensibility.html) on it in [Dive Into +HTML5](http://diveintohtml5.org/). Command Line ------------ -When you install microdata.py via pip it will be made available on the command -line too: +When you install microdata via pip it will also install a command line utility: - % microdata.py http://www.wdl.org/en/item/1/ +``` +$ microdata https://www.youtube.com/watch?v=dQw4w9WgXcQ +https://www.youtube.com/watch?v=dQw4w9WgXcQ +{ + "items": [ + { + "type": [ + "http://schema.org/VideoObject" + ], + "properties": { + "url": [ + "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + ], + "name": [ + "Rick Astley - Never Gonna Give You Up (Official Music Video)" + ], + "description": [ + "The official video for \u00e2\u20ac\u0153Never Gonna Give You Up\u00e2\u20ac\ufffd by Rick Astley \u00e2\u20ac\u0153Never Gonna Give You Up\u00e2\u20ac\ufffd was a global smash on its release in July 1987, topping the charts ..." 
+ ], + "paid": [ + "False" + ], + "channelId": [ + "UCuAXFkgsw1L7xaCfnd5JJOw" + ], + "videoId": [ + "dQw4w9WgXcQ" + ], + "duration": [ + "PT3M33S" + ], + "unlisted": [ + "False" + ], + "author": [ + { + "type": [ + "http://schema.org/Person" + ], + "properties": { + "url": [ + "http://www.youtube.com/channel/UCuAXFkgsw1L7xaCfnd5JJOw" + ], + "name": [ + "" + ] + } + } + ], + "thumbnailUrl": [ + "https://i.ytimg.com/vi/dQw4w9WgXcQ/maxresdefault.jpg" + ], + "thumbnail": [ + { + "type": [ + "http://schema.org/ImageObject" + ], + "properties": { + "url": [ + "https://i.ytimg.com/vi/dQw4w9WgXcQ/maxresdefault.jpg" + ], + "width": [ + "1280" + ], + "height": [ + "720" + ] + } + } + ], + "embedUrl": [ + "https://www.youtube.com/embed/dQw4w9WgXcQ" + ], + "playerType": [ + "HTML5 Flash" + ], + "width": [ + "1280" + ], + "height": [ + "720" + ], + "isFamilyFriendly": [ + "true" + ], + "regionsAllowed": [ + "AD,AE,AF,AG,AI,AL,AM,AO,AQ,AR,AS,AT,AU,AW,AX,AZ,BA,BB,BD,BE,BF,BG,BH,BI,BJ,BL,BM,BN,BO,BQ,BR,BS,BT,BV,BW,BY,BZ,CA,CC,CD,CF,CG,CH,CI,CK,CL,CM,CN,CO,CR,CU,CV,CW,CX,CY,CZ,DE,DJ,DK,DM,DO,DZ,EC,EE,EG,EH,ER,ES,ET,FI,FJ,FK,FM,FO,FR,GA,GB,GD,GE,GF,GG,GH,GI,GL,GM,GN,GP,GQ,GR,GS,GT,GU,GW,GY,HK,HM,HN,HR,HT,HU,ID,IE,IL,IM,IN,IO,IQ,IR,IS,IT,JE,JM,JO,JP,KE,KG,KH,KI,KM,KN,KP,KR,KW,KY,KZ,LA,LB,LC,LI,LK,LR,LS,LT,LU,LV,LY,MA,MC,MD,ME,MF,MG,MH,MK,ML,MM,MN,MO,MP,MQ,MR,MS,MT,MU,MV,MW,MX,MY,MZ,NA,NC,NE,NF,NG,NI,NL,NO,NP,NR,NU,NZ,OM,PA,PE,PF,PG,PH,PK,PL,PM,PN,PR,PS,PT,PW,PY,QA,RE,RO,RS,RU,RW,SA,SB,SC,SD,SE,SG,SH,SI,SJ,SK,SL,SM,SN,SO,SR,SS,ST,SV,SX,SY,SZ,TC,TD,TF,TG,TH,TJ,TK,TL,TM,TN,TO,TR,TT,TV,TW,TZ,UA,UG,UM,US,UY,UZ,VA,VC,VE,VG,VI,VN,VU,WF,WS,YE,YT,ZA,ZM,ZW" + ], + "interactionCount": [ + "1141688870" + ], + "datePublished": [ + "2009-10-24" + ], + "uploadDate": [ + "2009-10-24" + ], + "genre": [ + "Music" + ] + } + } + ] +} +``` -This will print out the JSON for items extracted from the supplied URL. 
Library ------- diff --git a/microdata.py b/microdata.py index 9cdb83c..19581a1 100755 --- a/microdata.py +++ b/microdata.py @@ -1,16 +1,32 @@ #!/usr/bin/env python import sys +import json import html5lib from collections import defaultdict +from urllib.request import urlopen, Request +USER_AGENT = "microdata.py " -try: - import json -except ImportError: - import simplejson as json +def main(): + + if len(sys.argv) < 2: + print("Usage: microdata ") + sys.exit(1) + + for url in sys.argv[1:]: + sys.stderr.write(url + "\n") + + microdata = {} + microdata['items'] = items = [] + + req = Request(url, headers={"User-Agent": USER_AGENT}) + for item in get_items(urlopen(req)): + items.append(item.json_dict()) + + print(json.dumps(microdata, indent=2)) def get_items(location, encoding=None): """ @@ -230,22 +246,6 @@ def _make_item(e): if __name__ == "__main__": - try: - from urllib.request import urlopen - except ImportError: - from urllib import urlopen - - if len(sys.argv) < 2: - print("Usage: %s URL [...]" % sys.argv[0]) - sys.exit(1) - - for url in sys.argv[1:]: - sys.stderr.write(url + "\n") - - microdata = {} - microdata['items'] = items = [] + main() - for item in get_items(urlopen(url)): - items.append(item.json_dict()) - print(json.dumps(microdata, indent=2)) diff --git a/setup.py b/setup.py index b3b779f..92a3bfe 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,19 @@ from setuptools import setup -import sys - -extra = {} setup( name = 'microdata', - version = '0.7.1', + version = '0.8.0', description = "html5lib extension for parsing microdata", author = "Ed Summers", author_email = "ehs@pobox.com", url = "http://github.com/edsu/microdata", + python_requires=">=3.3", py_modules = ['microdata'], - scripts = ['microdata.py'], test_suite = 'test', install_requires = ['html5lib>=0.999999999'], - **extra + entry_points = { + "console_scripts": [ + "microdata = microdata:main" + ] + } ) diff --git a/test.py b/test.py index 00bcae9..bf1a6c6 100644 --- a/test.py +++ 
b/test.py @@ -1,8 +1,4 @@ -try: - import json -except ImportError: - import simplejson as json - +import json import unittest from microdata import get_items, Item, URI