-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathparse.py
202 lines (178 loc) · 6.81 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from lxml import etree
from progressbar import ProgressBar, Percentage, Bar
from dateutil import parser, tz
import re
import unidecode
import datetime
import pytz
import requests
import os
import time
import codecs
# When True, Post.adjust_paths prints every attachment URL it rewrites.
DEBUG = False
# Path of the XML export that is parsed below (presumably a WordPress export,
# judging by the name and the wp: prefixes used in the queries -- confirm).
FILENAME = 'my-wordpress-export.xml'
# NOTE: the file is parsed at import time, so importing this module fails if
# FILENAME does not exist or is not well-formed XML.
tree = etree.parse(FILENAME)
# Prefix -> namespace URI mapping taken from the root element; the find_*
# functions pass it to find/findall/xpath to resolve prefixed paths.
namespaces = tree.getroot().nsmap
def slugify(string):
    """Return a lowercase, ASCII-only slug for *string*.

    The text is transliterated with ``unidecode``, lowercased, and every run
    of non-word characters is collapsed into a single hyphen.  ``None``
    yields an empty string.
    """
    if string is None:
        return ""
    ascii_lower = unidecode.unidecode(string).lower()
    return re.sub(r'\W+', '-', ascii_lower)
class Post:
    """A blog post and the metadata this module tracks for it.

    Omitted from the XML standard:
        pubDate
        guid
        excerpt:encoded
        post_date_gmt
        post_type
        post_password
        is_sticky
    """

    def __init__(self, id=None, title=None):
        """Create a post with default metadata; the slug is derived from *title*."""
        self.id = id
        self.title = title
        self.description = None
        self.creator = None
        self.body = None
        self.url = None
        # Naive local timestamp taken at construction time.
        self.post_date = datetime.datetime.now()
        self.comment_status = "open"
        self.ping_status = "open"
        self.slug = slugify(title)
        self.status = "publish"
        self.parent = None
        self.menu_order = 0
        self.tags = []
        self.categories = []
        self.comments = []

    def adjust_paths(self, attachments=None, prefix=''):
        """Point attachment URLs found in the body at ``prefix + file name``.

        attachments -- iterable of objects exposing a ``url`` attribute.
        prefix      -- new location of the files; must be empty or end in '/'.

        Returns False (after printing an error) when a non-empty prefix lacks
        the trailing slash, otherwise None.  Nothing is changed when the post
        has no body or no attachments are given.
        """
        # Compare by value: an identity test against a string literal only
        # works by accident of interning.
        if prefix != '' and not prefix.endswith('/'):
            print("[ERRR] Your attachment prefix does not end in a trailing slash")
            return False
        if self.body is None or attachments is None:
            return None
        for attachment in attachments:
            # A missing URL cannot be searched for, and an empty one would
            # match between every character of the body.
            if not attachment.url or attachment.url not in self.body:
                continue
            new_url = prefix + attachment.url.split('/')[-1]
            self.body = self.body.replace(attachment.url, new_url)
            if DEBUG:
                print("[DEBG] Replaced " + attachment.url + " with " + new_url)
        return None

    def fix_paragraphs(self):
        """Wrap the body in <p> tags, turning each newline into a paragraph break.

        Does nothing when the post has no body.
        """
        if self.body is None:
            return
        fixed = '<p>' + self.body.replace('\n', '</p><p>') + '</p>'
        # A blank line produces an empty paragraph; collapse it.
        self.body = fixed.replace('</p><p></p><p>', '</p><p>')

    def fix_more(self):
        """Replace the ``<!--more-->`` marker with ``[[MORE]]``.

        Does nothing when the post has no body.
        """
        if self.body is None:
            return
        self.body = self.body.replace('<!--more-->', '[[MORE]]')
class Attachment:
    """A file referenced by the export, identified by id, title and URL."""

    def __init__(self, id=None, title=None, url=None):
        """Store the attachment's id, title and source URL."""
        self.id = id
        self.title = title
        self.url = url

    def download(self, path='attachments'):
        """Fetch the attachment and save it in directory *path*.

        The file is named after the last path segment of the URL.  Nothing
        happens when the attachment has no URL.  *path* must already exist.

        Raises whatever ``requests`` raises for the request itself, and the
        error from ``raise_for_status()`` when the response is an HTTP error.
        """
        if self.url is None:
            return
        filename = self.url.split('/')[-1]
        response = requests.get(self.url)
        if response.status_code != requests.codes.ok:
            # Raises for 4xx/5xx responses; other non-200 codes are skipped
            # silently, as before.
            response.raise_for_status()
            return
        # The context manager closes the file even if the write fails.
        with open(os.path.join(path, filename), 'wb') as handle:
            handle.write(response.content)
def find_blog(tree):
    """Print the title of the first ``<title>`` element found in *tree*.

    Prints nothing when the tree has no ``<title>`` element.  Returns None.
    """
    # Only the title is reported, so only the title is looked up: a feed
    # without <link>, <description>, <pubDate> or <language> must not crash.
    title_elem = tree.find(".//title")
    if title_elem is not None:
        print("Found %s" % title_elem.text)
def find_authors(tree):
    """Collect every ``wp:author`` entry in *tree*.

    Returns a list with one dict per author, keyed by 'login', 'email',
    'username', 'first_name' and 'last_name'.  Each value is the matching
    child element itself (or None when absent), not its text.  Returns False
    and prints a warning when no author is found.
    """
    field_paths = (
        ('login', "./wp:author_login"),
        ('email', "./wp:author_email"),
        ('username', "./wp:author_display_name"),
        ('first_name', "./wp:author_first_name"),
        ('last_name', "./wp:author_last_name"),
    )
    authors = [
        dict((key, node.find(path, namespaces=namespaces))
             for key, path in field_paths)
        for node in tree.findall(".//wp:author", namespaces=namespaces)
    ]
    if not authors:
        print("[WARN] Found no authors!")
        return False
    print("Found %i authors" % len(authors))
    return authors
def find_tags(tree):
    """Collect every ``wp:tag`` entry in *tree*.

    Returns a list of dicts with the keys 'slug' and 'name'; each value is
    the matching child element (or None when absent), not its text.  Returns
    False and prints a warning when no tag is found.
    """
    tags = [
        {
            'slug': node.find("./wp:tag_slug", namespaces=namespaces),
            'name': node.find("./wp:tag_name", namespaces=namespaces),
        }
        for node in tree.findall(".//wp:tag", namespaces=namespaces)
    ]
    if not tags:
        print("[WARN] Found no tags!")
        return False
    print("Found %i tags" % len(tags))
    return tags
def _text_or_default(node, default=None):
    """Return *node*'s text as unicode, or *default* if the node or its text is missing."""
    if node is None or node.text is None:
        return default
    return unicode(node.text)


def find_posts(tree, published=True, timezone="America/Chicago"):
    """Build a Post for every item of type 'post' in *tree*.

    tree      -- parsed export; must support xpath() and findall().
    published -- when True only items whose status is 'publish' are used.
    timezone  -- name of the zone the export's wp:post_date values are in;
                 they are converted to UTC.

    Returns the list of posts, or False (after printing a warning) when none
    is found.  Missing id/title/link text yields None and a missing body
    yields an empty string, rather than the literal text 'None'.
    """
    if published:
        item_elems = tree.xpath(
            ".//item[wp:post_type='post' and wp:status='publish']",
            namespaces=namespaces)
    else:
        item_elems = tree.findall(".//item[wp:post_type='post']",
                                  namespaces=namespaces)

    # The zone is the same for every post, so resolve it once.
    local = pytz.timezone(timezone)
    posts = []
    for post_elem in item_elems:
        post = Post(
            _text_or_default(post_elem.find("./wp:post_id", namespaces=namespaces)),
            _text_or_default(post_elem.find("./title")))
        post.url = _text_or_default(post_elem.find("./link"))
        # Empty string rather than None so the Post body helpers keep working.
        post.body = _text_or_default(
            post_elem.find("./content:encoded", namespaces=namespaces), u'')

        post_stamp = parser.parse(
            post_elem.find("./wp:post_date", namespaces=namespaces).text)
        # is_dst=None makes pytz raise on ambiguous or non-existent local
        # times instead of guessing.
        local_stamp = local.localize(post_stamp, is_dst=None)
        post.post_date = local_stamp.astimezone(pytz.utc)

        # xpath() always returns a list, so no None check is needed.
        post.tags = [tag.get('nicename')
                     for tag in post_elem.xpath("./category[@domain='post_tag']")]
        posts.append(post)

    if not posts:
        print("[WARN] Found no posts!")
        return False
    print("Found %i posts" % len(posts))
    return posts
def find_attachments(tree, download=True):
    """Build an Attachment for every item of type 'attachment' in *tree*.

    tree     -- parsed export; must support xpath().
    download -- when True each attachment is downloaded into the
                'attachments' directory while a progress bar is shown.

    Returns the list of attachments, or False (after printing a warning)
    when none is found.  Errors raised by Attachment.download propagate.
    """
    attachment_elems = tree.xpath(".//item[wp:post_type='attachment']",
                                  namespaces=namespaces)
    attachments = []
    for attachment_elem in attachment_elems:
        title = attachment_elem.find("./title").text
        attachments.append(Attachment(
            attachment_elem.find("./wp:post_id", namespaces=namespaces).text,
            # An empty <title/> stays None instead of becoming u'None'.
            unicode(title) if title is not None else None,
            attachment_elem.find("./wp:attachment_url", namespaces=namespaces).text))

    if not attachments:
        print("[WARN] Found no attachments!")
        return False

    print("Found %i attachments" % len(attachments))
    if download:
        print("Downloading %i attachments" % len(attachments))
        progress = ProgressBar(widgets=[Percentage(), Bar()],
                               maxval=len(attachments)).start()
        # Count from 1 so the bar shows the number of finished downloads.
        for done, attachment in enumerate(attachments, 1):
            attachment.download('attachments')
            progress.update(done)
        progress.finish()
        print("Downloaded %i attachments" % len(attachments))
    return attachments