DokuWiki dump alpha #243

Open
wants to merge 2 commits into master
dokuwikidump.py: 389 additions & 0 deletions
@@ -0,0 +1,389 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# dokuwikidump.py A generator of dumps for DokuWiki wikis
# Copyright (C) 2011-2014 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki

try:
from BeautifulSoup import BeautifulSoup
except ImportError:
    print 'BeautifulSoup is needed for the current version. In the future it should use regexes for scraping.'
    raise

import HTMLParser
import urlparse
import requests
import os
import socket
import re
from datetime import datetime
import gzip
import time


def getTitles(url, ns=None):
"""Get titles given a doku.php URL and an (optional) namespace"""
titles = []
ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
params = {'call': 'index'}
if ns:
params['idx'] = ns
else:
print 'Finding titles'
ns = ns or ''
depth = len(ns.split(':'))
if ns:
print '%sLooking in namespace %s' % (' ' * depth, ns)
r = requests.post(ajax, params)
if r.status_code != 200 or "AJAX call 'index' unknown!" in r.text:
return getTitlesOld(url, ns=None)
soup = BeautifulSoup(r.text)
for a in soup.findAll('a', href=True):
if a.has_key('title'):
title = a['title']
else:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
title = (query['idx' if 'idx' in query else 'id'])[0]
if a['class'] == 'idx_dir':
titles += getTitles(url, title)
else:
titles.append(title)
time.sleep(1.5)
print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
return titles
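
# Usage sketch (the URL is a hypothetical example): getTitles walks the AJAX
# index recursively and returns fully qualified page IDs such as 'wiki:syntax'.
#   titles = getTitles('http://wiki.example.org/doku.php')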


def getTitlesOld(url, ns=None, ancient=False):
"""Get titles using the doku.php?do=index"""

titles = []
params = {'do': 'index'}

if ns:
params['idx'] = ns
ns = ns or ''
depth = len(ns.split(':'))

r = requests.get(url, params=params)
soup = BeautifulSoup(r.text).findAll('ul', {'class': 'idx'})[0]
attr = 'text' if ancient else 'title'

if ns:
print '%sSearching in namespace %s' % (' ' * depth, ns)

def match(href):
if not href:
return False
qs = urlparse.urlparse(href).query
qs = urlparse.parse_qs(qs)
return 'idx' in qs and qs['idx'][0] in (ns, ':' + ns)
result = soup.findAll(
'a', {
'class': 'idx_dir', 'href': match})[0].findAllPrevious('li')[0].findAll(
'a', {
'href': lambda x: x and not match(x)})
else:
print 'Finding titles (?do=index)'
result = soup.findAll('a')

for a in result:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
if a['class'] == 'idx_dir':
titles += getTitlesOld(url, query['idx'][0])
else:
titles.append(query['id'][0])

print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')

return titles


def getSourceExport(url, title, rev=''):
"""Export the raw source of a page (at a given revision)"""

r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'export_raw'})
return r.text


def getSourceEdit(url, title, rev=''):
"""Export the raw source of a page by scraping the edit box content. Yuck."""

r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'edit'})
soup = BeautifulSoup(r.text)
return ''.join(soup.find('textarea', {'name': 'wikitext'}).contents).strip()
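
# dumpContent() below prefers getSourceExport and only falls back to
# getSourceEdit when do=export_raw answers with an HTML content-type, i.e. when
# the wiki does not support raw export. Example call (hypothetical URL and page):
#   text = getSourceExport('http://wiki.example.org/doku.php', 'wiki:syntax')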


def domain2prefix(url):
""" Convert domain name to a valid prefix filename. """

domain = url

domain = domain.lower()
domain = re.sub(r'(https?://|www\.|/doku\.php)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)

return domain
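
# Worked example (hypothetical input): 'https://wiki.example.org/doku.php'
# loses its scheme and the trailing '/doku.php', then the dots, yielding the
# prefix 'wikiexampleorg'.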


def getRevisions(url, title, use_hidden_rev=False, select_revs=False):
""" Get the revisions of a page. This is nontrivial because different versions of DokuWiki return completely different revision HTML."""

revs = []
h = HTMLParser.HTMLParser()
if select_revs:
r = requests.get(url, params={'id': title, 'do': 'diff'})
soup = BeautifulSoup(r.text)
select = soup.find(
'select', {
'class': 'quickselect', 'name': 'rev2[1]'})
for option in select.findAll('option'):
text = option.text
date = ' '.join(text.split(' ')[:2])
username = len(text.split(' ')) > 2 and text.split(' ')[2]
summary = ' '.join(text.split(' ')[3:])

revs.append({'id': option['value'],
'user': username,
'sum': summary,
'date': date})

i = 0
continue_index = -1
cont = True

while cont:
r = requests.get(
url,
params={
'id': title,
'do': 'revisions',
'first': continue_index})

soup = BeautifulSoup(r.text)
lis = soup.findAll(
'div', {
'class': 'level1'})[0].findNext('ul').findAll('li')

for li in lis:
rev = {}
rev_hrefs = li.findAll(
'a', href=lambda href: href and (
'&rev=' in href or '?rev=' in href))
rev['minor'] = ('class', 'minor') in li.attrs

if rev_hrefs:
rev['id'] = urlparse.parse_qs(
urlparse.urlparse(
rev_hrefs[0]['href']).query)['rev'][0]

sum_span = li.findAll('span', {'class': 'sum'})
if sum_span and not select_revs:
sum_span = sum_span[0]
sum_text = sum_span.text.split(' ')[1:]
if sum_span.findAll('bdi'):
rev['sum'] = h.unescape(sum_span.find('bdi').text).strip()
else:
rev['sum'] = h.unescape(' '.join(sum_text)).strip()
elif not select_revs:
print repr(li.text)
wikilink1 = li.find('a', {'class': 'wikilink1'})
text_node = wikilink1 and wikilink1.next and wikilink1.next.next or ''
                if text_node.strip():
rev['sum'] = h.unescape(text_node).strip(u'\u2013 \n')

date_span = li.find('span', {'class': 'date'})
if date_span:
rev['date'] = date_span.text.strip()
else:
rev['date'] = ' '.join(li.text.split(' ')[:2])
matches = re.findall(
r'([0-9./]+ [0-9]{1,2}:[0-9]{1,2})',
rev['date'])
if matches:
rev['date'] = matches[0]

if not (select_revs and len(revs) > i and revs[i]['user']):
user_span = li.find('span', {'class': 'user'})
if user_span:
rev['user'] = user_span.text

if select_revs and len(revs) > i:
revs[i].update(rev)
else:
revs.append(rev)
i += 1

first = soup.findAll('input', {'name': 'first', 'value': True})
        # Compare the paging offsets numerically rather than as strings.
        continue_index = first and max(int(x['value']) for x in first)
cont = soup.find('input', {'class': 'button', 'accesskey': 'n'})
time.sleep(1.5)

if revs and use_hidden_rev and not select_revs:
soup2 = BeautifulSoup(requests.get(url, params={'id': title}).text)
revs[0]['id'] = soup2.find(
'input', {
'type': 'hidden', 'name': 'rev', 'value': True})['value']

return revs
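
# Each dict returned by getRevisions may carry 'id' (the Unix timestamp DokuWiki
# uses as a revision identifier), 'date', 'user', 'sum' (edit summary) and
# 'minor'; which fields are present depends on the DokuWiki version, so callers
# below check for each key before using it.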


def getFiles(url, ns=''):
""" Return a list of media filenames of a wiki """
files = set()
ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
medialist = BeautifulSoup(
requests.post(
ajax, {
'call': 'medialist', 'ns': ns, 'do': 'media'}).text)
medians = BeautifulSoup(
requests.post(
ajax, {
'call': 'medians', 'ns': ns, 'do': 'media'}).text)
imagelinks = medialist.findAll(
'a',
href=lambda x: x and re.findall(
'[?&](media|image)=',
x))
for a in imagelinks:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
key = 'media' if 'media' in query else 'image'
files.add(query[key][0])
files = list(files)
namespacelinks = medians.findAll('a', {'class': 'idx_dir', 'href': True})
for a in namespacelinks:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
files += getFiles(url, query['ns'][0])
print 'Found %d files in namespace %s' % (len(files), ns or '(all)')
return files
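
# Usage sketch (hypothetical URL, illustrative media ID): getFiles returns media
# IDs such as 'wiki:logo.png', which dumpMedia() below fetches through
# lib/exe/fetch.php.
#   files = getFiles('http://wiki.example.org/doku.php')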


def dumpContent(url):
os.mkdir(domain2prefix(url) + '/pages')
os.mkdir(domain2prefix(url) + '/attic')
os.mkdir(domain2prefix(url) + '/meta')

titles = getTitles(url)
if not len(titles):
print 'Empty wiki'
return

r1 = requests.get(url, params={'id': titles[0], 'do': 'export_raw'})
r2 = requests.get(url, params={'id': titles[0]})
r3 = requests.get(url, params={'id': titles[0], 'do': 'diff'})

getSource = getSourceExport
if 'html' in r1.headers['content-type']:
getSource = getSourceEdit

soup = BeautifulSoup(r2.text)
hidden_rev = soup.findAll(
'input', {
'type': 'hidden', 'name': 'rev', 'value': True})
use_hidden_rev = hidden_rev and hidden_rev[0]['value']

soup = BeautifulSoup(r3.text)
select_revs = soup.findAll(
'select', {
'class': 'quickselect', 'name': 'rev2[0]'})

for title in titles:
titleparts = title.split(':')
for i in range(len(titleparts)):
dir = "/".join(titleparts[:i])
if not os.path.exists(domain2prefix(url) + '/pages/' + dir):
os.mkdir(domain2prefix(url) + '/pages/' + dir)
if not os.path.exists(domain2prefix(url) + '/meta/' + dir):
os.mkdir(domain2prefix(url) + '/meta/' + dir)
if not os.path.exists(domain2prefix(url) + '/attic/' + dir):
os.mkdir(domain2prefix(url) + '/attic/' + dir)
with open(domain2prefix(url) + '/pages/' + title.replace(':', '/') + '.txt', 'w') as f:
f.write(getSource(url, title).encode("utf-8"))
revs = getRevisions(url, title, use_hidden_rev, select_revs)
for rev in revs[1:]:
if 'id' in rev and rev['id']:
with gzip.open(domain2prefix(url) + '/attic/' + title.replace(':', '/') + '.' + rev['id'] + '.txt.gz', 'w') as f:
f.write(getSource(url, title, rev['id']).encode("utf-8"))
time.sleep(1.5)
print 'Revision %s of %s' % (rev['id'], title)
with open(domain2prefix(url) + '/meta/' + title.replace(':', '/') + '.changes', 'w') as f:
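            # The rows written below are meant to follow DokuWiki's changelog
            # format (an assumption based on the fields assembled here):
            # tab-separated timestamp, IP, edit type ('E' for an edit, 'e' for a
            # minor edit), page ID, username and summary.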
# Loop through revisions in reverse.
for rev in revs[::-1]:
print rev, title
sum = 'sum' in rev and rev['sum'].strip() or ''
id = 0

ip = '127.0.0.1'
user = ''
minor = 'minor' in rev and rev['minor']

if 'id' in rev and rev['id']:
id = rev['id']
else:
# Different date formats in different versions of DokuWiki.
# If no ID was found, make one up based on the date (since rev IDs are Unix times)
# Maybe this is evil. Not sure.

try:
date = datetime.strptime(rev['date'], "%Y/%m/%d %H:%M")
id = str(int(time.mktime(date.utctimetuple())))
                    except ValueError:
date = datetime.strptime(rev['date'], "%d.%m.%Y %H:%M")
id = str(int(time.mktime(date.utctimetuple())))

rev['user'] = rev['user'] if 'user' in rev else 'unknown'
try:
# inet_aton throws an exception if its argument is not an IPv4 address
socket.inet_aton(rev['user'])
ip = rev['user']
except socket.error:
user = rev['user']

row = '\t'.join([id, ip, 'e' if minor else 'E', title, user, sum])
row = row.replace('\n', ' ')
row = row.replace('\r', ' ')

f.write((row + '\n').encode("utf-8"))


def dumpMedia(url):
prefix = domain2prefix(url)
os.mkdir(prefix + '/media')
os.mkdir(prefix + '/media_attic')
os.mkdir(prefix + '/media_meta')

fetch = urlparse.urljoin(url, 'lib/exe/fetch.php')

files = getFiles(url)
for title in files:
titleparts = title.split(':')
for i in range(len(titleparts)):
dir = "/".join(titleparts[:i])
if not os.path.exists(prefix + '/media/' + dir):
os.mkdir(prefix + '/media/' + dir)
with open(prefix + '/media/' + title.replace(':', '/'), 'wb') as f:
f.write(requests.get(fetch, params={'media': title}).content)
print 'File %s' % title
time.sleep(1.5)


def dump(url):
print domain2prefix(url)
os.mkdir(domain2prefix(url))
dumpContent(url)
dumpMedia(url)
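

# The patch defines no command-line entry point; the block below is only a
# sketch of how dump() could be invoked, assuming the wiki's doku.php URL is
# passed as the single command-line argument (the example URL is hypothetical).
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        print 'Usage: dokuwikidump.py http://wiki.example.org/doku.php'
        sys.exit(1)
    dump(sys.argv[1])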