
Contents of /trunk/tools/mediawiki/gcache2wiki/striphf.py



Revision 2188
Tue May 11 03:09:25 2010 UTC by jpye
File MIME type: text/x-python
File size: 6631 byte(s)
Added basic support for lists and tables.
#!/usr/bin/env python
# Filter to strip the header and footer stuff from the cached page

import re
import sys
from BeautifulSoup import *

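# Usage (inferred from the argument handling below; the original carries no
# usage notes): pass the filename of one cached page, e.g.
#
#   python striphf.py Some_Page.html
#
# Progress messages and the resulting wikitext are printed to stdout.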
7 print "Reading file",sys.argv[1],"..."
8
9 f = open(sys.argv[1]).read()
10
11 print "Parsing whole page..."
12
13 s = BeautifulSoup(f);
14
title = s.title.string

if title[-9:] != " - ASCEND":
    print "Incorrect title '%s'" % s.title.string
    sys.exit(1)

title = title[:-9]
print " page title = '%s'" % title

# MediaWiki wraps the article body between these comment markers; keep just that
r = re.compile("<!-- start content -->(.*)<!-- end content -->", re.S)

pagecontent = r.search(f).group(1)

print "Parsing page content..."
s1 = BeautifulSoup(pagecontent)

def replace_templates(soup):
    # put back the {{task}} template where its rendered notice box appears
    for t in soup.findAll("div",{"id":"task","class":"notice metadata"}):
        t.replaceWith(NavigableString("{{task}}"))

def strip_contents(soup):
    c = soup.find('table',{'id':'toc','class':'toc','summary':'Contents'})
    if c:
        c.extract()

def strip_script(soup):
    for s in soup.findAll('script'):
        s.extract()

def strip_highlight(soup):
    # Convert syntax-highlighted code blocks into <src lang="a4c"> tags,
    # removing the inline CSS and the four sibling nodes of theme boilerplate
    # that sit between the <style> paragraph and the highlighted <pre>.
    for a1 in soup.findAll('p'):
        if a1.find('style',{'type':"text/css"}):
            n1 = a1.nextSibling
            if str(n1.string).strip() != "/* Highlighting theme definition: */":
                print "NO MATCH"
                sys.exit(1)
            n2 = n1.nextSibling
            #print "REMOVING",str(a1)
            a1.extract()
            #print "REMOVING N1",str(n1)
            n1.extract()
            n3 = n2.nextSibling
            #print "REMOVING N2",str(n2)
            n2.extract()
            n4 = n3.nextSibling
            #print "REMOVING N3",str(n3)
            n3.extract()
            pre = n4.nextSibling
            #print "REMOVING N4",str(n4)
            n4.extract()

            if pre.name != 'pre':
                print "ERROR parsing syntax-highlighting:",pre
                sys.exit(1)
            for x in pre.findAll('b',{'style':True}):
                x1 = NavigableString(str(x.string))
                x.replaceWith(x1)

            for x in pre.findAll('span',{'class':True}):
                x1 = NavigableString(str(x.renderContents()))
                x.replaceWith(x1)

            t = Tag(soup,"src",[("lang",'a4c')])
            t.insert(0, NavigableString(str(pre.renderContents()).strip()))
            pre.replaceWith(t)

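# Illustrative sketch of the structure strip_highlight expects (assumed from
# the sibling-walking above, not from any documented cache format):
#
#   <p><style type="text/css">...</style></p>
#   /* Highlighting theme definition: */
#   ... (three more sibling nodes of theme boilerplate) ...
#   <pre><b style="...">MODEL</b> <span class="...">demo</span>; ...</pre>
#
# which the pass above reduces to
#
#   <src lang="a4c">MODEL demo; ...</src>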
def strip_anchors(soup):
    for a1 in soup.findAll('a',{'name':True}):
        print "ANCHOR:",a1
        a1.extract()

def wikify_headings(soup):
    for h in soup.findAll(['h1','h2','h3','h4','h5','h6']):
        if not h.find('span',{'class':'mw-headline'}):
            print "HEADING: SKIPPING:",h
            continue
        print "HEADING:",h
        level = int(str(h.name)[1:])
        h2 = NavigableString("="*level + h.span.renderContents() + "="*level)
        h.replaceWith(h2)

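# Example (hypothetical input): wikify_headings turns
#   <h2><span class="mw-headline">Solver options</span></h2>
# into the wikitext
#   ==Solver options==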
def wikify_paragraphs(soup):
    for p in soup.findAll('p'):
        #print "PARA",str(p)
        if p.renderContents() is None:
            p.replaceWith(NavigableString("\n"))
        else:
            p.replaceWith(NavigableString("\n" + p.renderContents()))

def strip_printfooter(soup):
    d = soup.find('div',{'class':'printfooter'})
    if d:
        d.extract()

def wikify_categories(soup):
    cats = soup.find("div",{"id":"catlinks"})
    if not cats:
        return
    r2 = re.compile("/[A-Z][a-z_0-9-]*")
    cc = []
    for a in cats.findAll("a"):
        if str(a['href']) == "/Special:Categories":
            #print "CATEGORIES LINK ignored"
            a.extract()
        elif r2.match(a['href']):
            t = NavigableString("[[" + a['href'][1:] + "]]\n")
            print " categ:",t.strip()
            cc.append(t)
    #print "CATS:",cc
    #cats.replace(cc)
    for c in cc:
        cats.parent.append(c)
    cats.extract()

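# Example (hypothetical input): given
#   <div id="catlinks"><a href="/Special:Categories">Categories</a>:
#       <a href="/Category:Solvers">Solvers</a></div>
# wikify_categories drops the Special:Categories link, appends
#   [[Category:Solvers]]
# to the parent node, and removes the catlinks div.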
def wikify_images(soup):
    for a in soup.findAll("a",{'class':'image'}):
        if a.img:
            if a.img['alt'][0:6] == "Image:":
                print "IMG1",a.img['alt'][6:]
                a1 = NavigableString("[[Image:" + a.img['alt'][6:] + "]]")
                print "-->",a1
                a.replaceWith(a1)
            elif a['href'][0:6] == "/File:":
                print "IMG",a['href'][6:]
                a1 = NavigableString("[[Image:" + a['href'][6:] + "]]")
                a.replaceWith(a1)
                print "-->",a1
            else:
                print "CAN'T PROCESS IMAGE LINK",a

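# Example (hypothetical input): wikify_images turns
#   <a class="image" href="/File:Plot.png"><img alt="Image:Plot.png" ...></a>
# into the wikitext
#   [[Image:Plot.png]]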
def wikify_links(soup):
    rr1 = re.compile(" ")
    def linkified(s):
        s = rr1.sub("_",s)
        s = s[0:1].upper() + s[1:]
        return s

    r = re.compile("^http://")
    r2 = re.compile("/[A-Z][a-z_0-9-]*")
    r3 = re.compile(r"^http://ascendcode.cheme.cmu.edu/viewvc.cgi/code/(.*)$")
    r3trunk = re.compile(r"trunk/(.*)\?view=markup$")
    r3branch = re.compile(r"branches/([^)]+)/(.*)\?view=markup$")
    r3dir = re.compile(r"trunk/(.*)")
    for a in soup.findAll('a',{'href':True}):
        #print "LINK:",a.parent
        m3 = r3.match(a['href'])
        if m3:
            t1 = m3.group(1)
            m3 = r3trunk.match(t1)
            if m3:
                t = NavigableString("{{src|%s}}" % m3.group(1))
                a.replaceWith(t)
            else:
                m3 = r3branch.match(t1)
                if m3:
                    t = NavigableString("{{srcbranch|%s|%s}}" % (m3.group(1),m3.group(2)))
                    a.replaceWith(t)
                else:
                    m3 = r3dir.match(t1)
                    if m3:
                        t = NavigableString("{{srcdir|%s}}" % m3.group(1))
                        a.replaceWith(t)
                    else:
                        t = NavigableString("[" + a['href'] + " " + a.renderContents() + "]")
                        a.replaceWith(t)
            print " --> ",t
        elif r.match(a['href']):
            if a['href'] == a.renderContents():
                t = NavigableString("[" + a['href'] + "]")
            else:
                t = NavigableString("[" + a['href'] + " " + a.renderContents() + "]")
            a.replaceWith(t)
            print " --> ",t

        elif r2.match(a['href']):
            if linkified(a.renderContents()) == a['href'][1:]:
                t = NavigableString("[[" + a.renderContents() + "]]")
            else:
                t = NavigableString("[[" + a['href'][1:] + "|" + a.renderContents() + "]]")
            a.replaceWith(t)
            print " --> ",t

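# Examples of what wikify_links produces (illustrative, based on the regexes
# above; the URLs and page names are hypothetical):
#   http://ascendcode.cheme.cmu.edu/viewvc.cgi/code/trunk/models/a.a4c?view=markup
#       --> {{src|models/a.a4c}}
#   http://ascendcode.cheme.cmu.edu/viewvc.cgi/code/branches/relnotes/models/a.a4c?view=markup
#       --> {{srcbranch|relnotes|models/a.a4c}}
#   <a href="http://example.com">Example</a>  --> [http://example.com Example]
#   <a href="/Some_Page">Some Page</a>        --> [[Some Page]]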
def wikify_bold(soup):
    for b in soup.findAll("b"):
        #print "BOLD:",b
        b2 = NavigableString("'''" + b.renderContents() + "'''")
        b.replaceWith(b2)

def wikify_italics(soup):
    for i in soup.findAll("i"):
        i.replaceWith("''" + i.renderContents() + "''")

def wikify_lists(soup):
    # FIXME handle nested lists!
    for ul in soup.findAll("ul"):
        items = []  # fresh item list for each <ul>
        for li in ul.findAll("li"):
            print "LIST ITEM:",li.renderContents().strip()
            items += [NavigableString("\n* %s" % li.renderContents().strip())]
        l2 = Tag(soup,"div")
        for i in range(len(items)):
            l2.insert(i,items[i])
        ul.replaceWith(NavigableString(l2.renderContents()))
        print "NEW LIST:",l2.renderContents()

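# Example (hypothetical input): wikify_lists turns
#   <ul><li>first</li><li>second</li></ul>
# into the wikitext (nested lists are not handled yet, per the FIXME)
#   * first
#   * second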
def wikify_tables(soup):
    for ta in soup.findAll("table"):
        s = '\n{| class="wikitable"\n'
        for tr in ta.findAll("tr"):
            s += "|-\n"
            for t in tr.findAll(["td",'th']):
                # each cell on its own line: "| cell" for data, "! cell" for headers
                if t.name == "td":
                    s += "| " + t.renderContents() + "\n"
                else:
                    s += "! " + t.renderContents() + "\n"
        s += "|}"
        ta.replaceWith(NavigableString(s))

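# Example (hypothetical input): wikify_tables turns
#   <table><tr><th>name</th></tr><tr><td>QRSlv</td></tr></table>
# into the wikitext
#   {| class="wikitable"
#   |-
#   ! name
#   |-
#   | QRSlv
#   |}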
replace_templates(s1)
strip_contents(s1)
strip_script(s1)
strip_printfooter(s1)
strip_highlight(s1)
strip_anchors(s1)
wikify_headings(s1)
wikify_paragraphs(s1)
wikify_categories(s1)
# re-parse after the text substitutions so the following passes see a clean tree
s1 = BeautifulSoup(str(s1))
wikify_bold(s1)
s1 = BeautifulSoup(str(s1))
wikify_italics(s1)
s1 = BeautifulSoup(str(s1))
wikify_images(s1)

wikify_links(s1)

wikify_lists(s1)
wikify_tables(s1)

print str(s1)
sys.exit(0)

Properties

svn:executable = *
