
Contents of /trunk/tools/mediawiki/html2mediawiki/striphf.py



Revision 2198
Thu May 13 14:21:03 2010 UTC by jpye
File MIME type: text/x-python
File size: 8930 byte(s)
Log message: Upload
#!/usr/bin/env python
# Filter to strip the header and footer stuff from the cached page

from BeautifulSoup import *
import re   # used by wikify_categories, wikify_links and html2wiki below
import sys

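# Summary of the conversion pipeline implemented below: html2wiki() extracts
# the HTML between the "<!-- start content -->" and "<!-- end content -->"
# markers of a cached MediaWiki page, strips wiki chrome (table of contents,
# scripts, print footer, parser-cache comments, syntax-highlighting markup,
# anchors), then rewrites headings, paragraphs, categories, bold/italics,
# images, TeX images, definition-list indents, links, lists and tables into
# MediaWiki markup, returning the wiki text together with the page title.
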
def replace_templates(soup):
	for t in soup.findAll("div",{"id":"task","class":"notice metadata"}):
		t.replaceWith(NavigableString("{{task}}"))

def strip_contents(soup):
	c = soup.find('table',{'id':'toc','class':'toc','summary':'Contents'})
	if c:
		c.extract()

def strip_wiki_comments(soup):
	msg1 = "NewPP limit report"
	l1 = len(msg1)
	msg2 = "Saved in parser cache"
	l2 = len(msg2)
	#print "STRIPPING WIKI COMMENTS"
	def co(tag):
		if isinstance(tag, Comment):
			if tag.string.strip()[0:l1] == msg1 or tag.string.strip()[0:l2]==msg2:
				#print "COMMENT:",tag.string.strip()
				return True
		return False
	for c in soup.findAll(text=co):
		c.extract()


def strip_script(soup):
	for s in soup.findAll('script'):
		s.extract()

def strip_highlight(soup):
	for a1 in soup.findAll('p'):
		if a1.find('style',{'type':"text/css"}):
			n1 = a1.nextSibling
			if str(n1.string).strip() != "/* Highlighting theme definition: */":
				#print "NO MATCH"
				sys.exit(1)
			n2 = n1.nextSibling
			#print "REMOVING",str(a1)
			a1.extract()
			#print "REMOVING N1",str(n1)
			n1.extract()
			n3 = n2.nextSibling
			#print "REMOVING N2",str(n2)
			n2.extract()
			n4 = n3.nextSibling
			#print "REMOVING N3",str(n3)
			n3.extract()
			pre = n4.nextSibling
			#print "REMOVING N4",str(n4)
			n4.extract()

			if pre.name != 'pre':
				#print "ERROR parsing syntax-highlighting:",pre
				sys.exit(1)
			for x in pre.findAll('b',{'style':True}):
				x1 = NavigableString(str(x.string))
				x.replaceWith(x1)

			for x in pre.findAll('span',{'class':True}):
				x1 = NavigableString(str(x.renderContents()))
				x.replaceWith(x1)

			t = Tag(soup,"source",[("lang",'a4c')])
			t.insert(0, NavigableString(str(pre.renderContents()).strip()))
			pre.replaceWith(t)

def strip_anchors(soup):
	for a1 in soup.findAll('a',{'name':True}):
		#print "ANCHOR:",a1
		a1.extract()

def wikify_headings(soup):
	for h in soup.findAll(['h1','h2','h3','h4','h5','h6']):
		if not h.find('span',{'class':'mw-headline'}):
			#print "HEADING: SKIPPING:",h
			continue
		#print "HEADING:",h
		level = int(str(h.name)[1:])
		h2 = NavigableString("\n" + "="*level + h.span.renderContents() + "="*level)
		h.replaceWith(h2)

def wikify_paragraphs(soup):
	for p in soup.findAll('p'):
		#print "PARA",str(p)
		if p.renderContents() is None:
			p.replaceWith(NavigableString("\n"))
		else:
			p.replaceWith(NavigableString("\n" + p.renderContents()))

def strip_printfooter(soup):
	soup.find('div',{'class':'printfooter'}).extract()

def strip_wikicomments(soup):
	pass

def wikify_categories(soup):
	cats = soup.find("div",{"id":"catlinks"})
	if not cats:
		return
	r2 = re.compile("/[A-Z][a-z_0-9-]*")
	cc = []
	for a in cats.findAll("a"):
		if str(a['href']) == "/Special:Categories":
			#print "CATEGORIES LINK ignored"
			a.extract()
		elif r2.match(a['href']):
			t = NavigableString("[[" + a['href'][1:] + "]]\n")
			#print "  categ:",t.strip()
			cc.append(t)
	#print "CATS:",cc
	#cats.replace(cc)
	for c in cc:
		cats.parent.append(c)
	cats.extract()

def wikify_images(soup):
	for a in soup.findAll("a",{'class':'image'}):
		if a.img:
			if a.img['alt'][0:6] == "Image:":
				#print "IMG1",a.img['alt'][6:]
				a1 = NavigableString("[[Image:" + a.img['alt'][6:] + "]]")
				#print "-->",a1
				a.replaceWith(a1)
			elif a['href'][0:6] == "/File:":
				#print "IMG",a['href'][6:]
				a1 = NavigableString("[[Image:" + a['href'][6:] + "]]")
				a.replaceWith(a1)
				#print "-->",a1
			else:
				sys.stderr.write("CAN'T PROCESS IMAGE LINK %s\n" % str(a))

def wikify_math(soup):
	for img in soup.findAll("img",{'class':'tex'}):
		s = "<math>" + img['alt'] + "</math>"
		#print "MATH:",s
		img1 = NavigableString(s)
		img.replaceWith(img1)
		#img.replaceWith(NavigableText(s))

def wikify_indents(soup):
	for dl in soup.findAll("dl"):
		s = ""
		for dd in dl.findAll("dd"):
			s += ":" + dd.renderContents() + "\n"
		dl1 = NavigableString(s)
		dl.replaceWith(dl1)

def wikify_links(soup):
	rr1 = re.compile(" ")
	def linkified(s):
		# convert link text to the corresponding page name: spaces become
		# underscores, first letter is capitalised
		s = rr1.sub("_",s)
		s = s[0:1].upper() + s[1:]
		return s

	r = re.compile("^http://")            # external links
	r2 = re.compile("/[A-Z][a-z_0-9-]*")  # internal wiki page links
	r3 = re.compile(r"^http://ascendcode.cheme.cmu.edu/viewvc.cgi/code/(.*)$")  # ViewVC code-browser links
	r3trunk = re.compile(r"trunk/(.*)\?view=markup$")
	r3branch = re.compile(r"branches/([^/]+)/(.*)\?view=markup$")
	r3dir = re.compile(r"trunk/(.*)")
	r4 = re.compile(r"^([^)]+)\s+\(page does not exist\)$")  # 'red' links to missing pages
	for a in soup.findAll('a',{'href':True}):
		#print "LINK:",a.parent
		m3 = r3.match(a['href'])
		if m3:
			# code-browser links become {{src}}, {{srcbranch}} or {{srcdir}} templates
			t1 = m3.group(1)
			m3 = r3trunk.match(t1)
			if m3:
				t = NavigableString("{{src|%s}}" % m3.group(1))
				a.replaceWith(t)
			else:
				m3 = r3branch.match(t1)
				if m3:
					t = NavigableString("{{srcbranch|%s|%s}}" % (m3.group(1),m3.group(2)))
					a.replaceWith(t)
				else:
					m3 = r3dir.match(t1)
					if m3:
						t = NavigableString("{{srcdir|%s}}" % m3.group(1))
						a.replaceWith(t)
					else:
						t = NavigableString("[" + a['href'] + " " + a.renderContents() + "]")
						a.replaceWith(t)
						#print "LINK:",t
		elif r.match(a['href']):
			# external links: [url] or [url text]
			if a['href'] == a.renderContents():
				t = NavigableString("[" + a['href'] + "]")
			else:
				t = NavigableString("[" + a['href'] + " " + a.renderContents() + "]")
			a.replaceWith(t)
			#print "LINK:",t

		elif r2.match(a['href']):
			# internal links: [[Page]] or [[Page|text]]
			if linkified(a.renderContents()) == a['href'][1:]:
				t = NavigableString("[[" + a.renderContents() + "]]")
			else:
				t = NavigableString("[[" + a['href'][1:] + "|" + a.renderContents() + "]]")
			a.replaceWith(t)
			#print "LINK:",t

		else:
			# red links to missing pages: keep them as [[Page]]
			m4 = r4.match(a['title'])
			if m4:
				t = NavigableString("[[" + m4.group(1) + "]]")
				a.replaceWith(t)

def wikify_bold(soup):
	for b in soup.findAll("b"):
		#print "BOLD:",b
		b2 = NavigableString("'''" + b.renderContents() + "'''")
		b.replaceWith(b2)

def wikify_italics(soup):
	for i in soup.findAll("i"):
		i.replaceWith("''" + i.renderContents() + "''")


def wikify_list(l, prefix="*"):
	#print "WIKIFY L:",l.prettify()
	s = ""
	for tag in l.findAll(['ul','ol','li']):
		if tag.name == "ul":
			s += wikify_list(tag,prefix+"*")
		elif tag.name == "ol":
			s += wikify_list(tag,prefix+"#")
		elif tag.name == "li":
			# sometimes nested lists are incorrectly placed within a <li>
			#print "STUFF IN LI:"
			if tag.findAll(["ol","ul"]):
				for stuff in tag:
					if isinstance(stuff,Tag) and stuff.name == "ol":
						s += wikify_list(stuff,prefix + "#")
					elif isinstance(stuff,Tag) and stuff.name == "ul":
						s += wikify_list(stuff,prefix + "*")
					elif isinstance(stuff,NavigableString) and stuff.string.strip():
						s += "\n" + prefix + " " + stuff.string.strip()
			else:
				s += "\n" + prefix + " " + tag.renderContents().strip()


	#print "\n\nRESULT OF WIKIFY L:",s,"\n\n"

	return s

def wikify_lists(soup):
	# FIXME handle nested lists!
	for ul in soup.findAll("ul"):
		ul.replaceWith(NavigableString(wikify_list(ul,"*")))
	for ol in soup.findAll("ol"):
		ol.replaceWith(NavigableString(wikify_list(ol,"#")))

def wikify_tables(soup):
	for ta in soup.findAll("table"):
		s = '\n{| class="wikitable"\n'
		for tr in ta.findAll("tr"):
			s += "|-\n"
			for t in tr.findAll(["td",'th']):
				# one cell per line, so that the wikitable markup stays valid
				if t.name == "td":
					s += "| " + t.renderContents() + "\n"
				else:
					s += "! " + t.renderContents() + "\n"
		s += "|}"
		ta.replaceWith(NavigableString(s))


def html2wiki(html,wikiname):
	"""
	This is the main function that converts an HTML string into the corresponding wiki syntax.
	It expects a full HTML page including header, footer, etc., not just the 'content' section
	of the page. A usage sketch is given at the end of this file.

	@param html the raw 'page source' input (eg from Google Cache)
	@param wikiname the name of the wiki from which the content is derived
	"""
	s = BeautifulSoup(html)

	title = s.title.string

	# the page title should end with " - <wikiname>"; strip that suffix off
	suffix = " - " + wikiname
	if title[-len(suffix):] != suffix:
		print "Incorrect title '%s'" % s.title.string
		sys.exit(1)

	title = title[:-len(suffix)]
	#print "  page title = '%s'" % title

	r = re.compile("<!-- start content -->(.*)<!-- end content -->",re.S)

	pagecontent = r.search(html).group(1)

	#print "Parsing page content..."
	s1 = BeautifulSoup(pagecontent)

	replace_templates(s1)
	strip_contents(s1)
	strip_wiki_comments(s1)
	strip_script(s1)
	strip_printfooter(s1)
	strip_highlight(s1)
	strip_anchors(s1)
	wikify_headings(s1)
	wikify_paragraphs(s1)
	wikify_categories(s1)
	s1 = BeautifulSoup(str(s1))
	wikify_bold(s1)
	s1 = BeautifulSoup(str(s1))
	wikify_italics(s1)
	s1 = BeautifulSoup(str(s1))
	wikify_images(s1)
	wikify_math(s1)
	wikify_indents(s1)

	wikify_links(s1)

	wikify_lists(s1)
	wikify_tables(s1)

	# TODO: do something to catch 'texhtml'?

	return str(s1),title

if __name__=="__main__":
	sys.stderr.write("Reading file %s...\n"%sys.argv[1])
	f = open(sys.argv[1]).read()
	wikiname = "ASCEND"
	# print the converted wiki text on stdout; report the page title on stderr
	wikitext, title = html2wiki(f,wikiname)
	sys.stderr.write("Page title: %s\n" % title)
	print wikitext

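# Usage sketch (not part of the original script; the file names below are
# hypothetical). From the command line, pass a saved copy of a wiki page:
#
#   python striphf.py Some_cached_page.html > Some_page.wiki
#
# or from another Python 2 script:
#
#   from striphf import html2wiki
#   wikitext, title = html2wiki(open("Some_cached_page.html").read(), "ASCEND")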

Properties

svn:executable = *
