
Commit

added basic webapp
heikoheiko committed Nov 18, 2012
1 parent 4daddb4 commit a55f8b9
Showing 12 changed files with 125 additions and 51 deletions.
8 changes: 8 additions & 0 deletions .idea/libraries/sass_stdlib.xml


3 changes: 3 additions & 0 deletions .idea/misc.xml


29 changes: 28 additions & 1 deletion README.md
@@ -1,4 +1,31 @@
wikifunken
==========

Tools to bring Wikipedia Offline

# Select NUM=1000 articles to download
~/dev/wikifunken/ ./articleselector.py scored_articles.15.11.12.txt 1000 > data/articles.txt
selected 1000 of 1139809 articles with minimum rank 1846

# Fetch articles
~/dev/wikifunken/ ./fetcharticles.py data/articles.txt data/articles/

# Extract image links
~/dev/wikifunken/ ./getimagelinks.py data/articles/ | sort | uniq > data/images.txt

# Download images (gevent-based; imagedownloader.py is the eventlet alternative)
~/dev/wikifunken/ ./cimagedownloader.py data/images.txt data/images/

# Define layout

# Process articles
# rewrite links, remove unwanted sections, ... (see the sketch after this list)

# Index articles
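The processing step above is still a TODO; as a rough illustration only, a hypothetical link-rewriting pass with lxml (not part of this commit; the helper name and target paths are assumptions) could look like this:

# hypothetical sketch of the link-rewriting pass, not in the repo yet
import urllib
from lxml import etree

def rewrite_links(fn):
    parser = etree.HTMLParser()
    tree = etree.parse(open(fn), parser)
    root = tree.getroot()
    for a in root.xpath('.//a[@href]'):
        href = a.get('href')
        if href.startswith('/wiki/'):
            # make wiki links relative, so they resolve inside site/enwp/articles/
            a.set('href', urllib.quote(href[len('/wiki/'):], safe=''))
    return etree.tostring(root, method='html')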


15 changes: 9 additions & 6 deletions cimagedownloader.py
@@ -1,12 +1,18 @@
#!/usr/bin/env python
"""
based on: https://github.com/gwik/geventhttpclient
This one totally rocks: fetches ~100 images/second from the Wikipedia servers.
Average image size is 17.4 KB, i.e. ~1.7 MB/s (~13 Mbit/s).
Downloads the ~400K images of a 50K-article collection in about an hour.
"""
import sys, os, hashlib
import gevent.pool
from geventhttpclient import HTTPClient
from geventhttpclient.url import URL
user_agent = 'Wikipedia 1.0 Bot'
import layout
#user_agent = 'Wikipedia 1.0 Bot'

http_clients = dict()

@@ -29,9 +35,6 @@ def fetch(http, url, fn, pool, num):
else:
print 'err', response.status_code, url

def url2fn(url):
ext = url.split('.')[-1]
return hashlib.md5(url).hexdigest() + '.' + ext

def main():
urls_fn = sys.argv[1]
@@ -45,7 +48,7 @@ def main():
for i,url in enumerate(open(urls_fn)):
#if i> 400: break
url = url.strip()
fn = os.path.join(images_dir, url2fn(url))
fn = os.path.join(images_dir, layout.url2fn(url))
if not os.path.exists(fn):
http = get_client(url)
pool.spawn(fetch, http, url, fn, pool, i)
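For readers unfamiliar with geventhttpclient, a minimal standalone example of the pattern this script builds on (illustrative only; the URL and target path are placeholders, and the real script pools one HTTPClient per host via get_client):

# minimal gevent + geventhttpclient fetch, illustrative only
import gevent.pool
from geventhttpclient import HTTPClient
from geventhttpclient.url import URL

def fetch_one(url, fn):
    u = URL(url)
    client = HTTPClient.from_url(u)
    response = client.get(u.request_uri)
    if response.status_code == 200:
        open(fn, 'wb').write(response.read())
    client.close()

pool = gevent.pool.Pool(20)   # cap concurrent downloads
pool.spawn(fetch_one, 'http://example.org/some-image.jpg', '/tmp/some-image.jpg')
pool.join()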
4 changes: 2 additions & 2 deletions fetcharticles.py
@@ -17,7 +17,6 @@ def safe_fn(fn):

def fetch(name):
url = 'http://en.m.wikipedia.org/wiki/' + urllib2.quote(name.encode('utf8'), safe='')
print 'fetching', url
req = urllib2.Request(url, headers={'User-Agent':user_agent})
res = urllib2.urlopen(req)
return res.read()
@@ -33,9 +32,10 @@ def main(articles_fn, out_dir):
# print 'already there', fn
continue
try:
print 'fetching', i, name
html = fetch(name)
except (urllib2.HTTPError, KeyError, urllib2.URLError), e:
print repr(name), name, e
print i, repr(name), name, e
continue
open(fn, 'w').write(html)
time.sleep(fetch_delay)
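Worth noting about the fetch URL above: quote(..., safe='') also escapes slashes, so article names that contain '/' still map to a single path segment, e.g.:

>>> import urllib2
>>> 'http://en.m.wikipedia.org/wiki/' + urllib2.quote(u'AC/DC'.encode('utf8'), safe='')
'http://en.m.wikipedia.org/wiki/AC%2FDC'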
5 changes: 4 additions & 1 deletion getimagelinks.py
@@ -14,14 +14,17 @@

import sys, os
from lxml import etree
import layout



def parse_article(fn):
parser = etree.HTMLParser()
tree = etree.parse(open(fn), parser)
e = tree.getroot()
for i in e.xpath('.//img'):
try:
print 'http:%s' % i.get('src')
print layout.norm_ext_img_url(i.get('src'))
except UnicodeEncodeError:
sys.stderr.write('UnicodeError %s %r \n' %(fn, i.get('src')))

15 changes: 4 additions & 11 deletions imagedownloader.py
@@ -9,18 +9,16 @@
searching for new urls, and dispatching new fetches. The GreenPool
acts as sort of a job coordinator (and concurrency controller of
course).
"""
"""
!!!! alternative: https://github.com/gwik/geventhttpclient
"""
"""
from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import sys, os, hashlib, time
import layout

pool_size = 10
pool_size = 20
pool = eventlet.GreenPool(pool_size)
user_agent = 'Wikipedia 1.0 Bot'

@@ -52,18 +50,13 @@ def fetchall(urlfns):
pool.spawn_n(fetch, urlfns.pop(), urlfns)
pool.waitall()

def url2fn(url):
ext = url.split('.')[-1]
return hashlib.md5(url).hexdigest() + '.' + ext


def main():
urls_fn = sys.argv[1]
images_dir = sys.argv[2]
urlfns = []
for url in open(urls_fn):
url = url.strip()
fn = os.path.join(images_dir, url2fn(url))
fn = os.path.join(images_dir, layout.ext_img_url2fn(url))
if not os.path.exists(fn):
urlfns.append((url, fn))
fetchall(urlfns)
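The eventlet counterpart of the gevent sketch above, boiled down (illustrative only; the real fetchall pops (url, filename) pairs off a shared list so running fetchers can dispatch new ones):

# minimal eventlet GreenPool fetch, illustrative only
from eventlet.green import urllib2
import eventlet

pool = eventlet.GreenPool(20)

def fetch(url, fn):
    req = urllib2.Request(url, headers={'User-Agent': 'Wikipedia 1.0 Bot'})
    open(fn, 'wb').write(urllib2.urlopen(req).read())

for url, fn in [('http://example.org/a.jpg', '/tmp/a.jpg')]:  # placeholder pair
    pool.spawn_n(fetch, url, fn)
pool.waitall()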
20 changes: 2 additions & 18 deletions indexarticles.py
@@ -1,22 +1,6 @@
#!/usr/bin/env python
'''
>>> from whoosh.fields import *
>>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
>>> ix = create_in("indexdir", schema)
>>> writer = ix.writer()
>>> writer.add_document(title=u"First document", path=u"/a",
... content=u"This is the first document we've added!")
>>> writer.add_document(title=u"Second document", path=u"/b",
... content=u"The second one is even more interesting!")
>>> writer.commit()
>>> from whoosh.qparser import QueryParser
>>> with ix.searcher() as searcher:
... query = QueryParser("content", ix.schema).parse("first")
... results = searcher.search(query)
... results[0]
...
{"title": u"First document", "path": u"/a"}
'''
import sys, os
import urllib2
@@ -42,7 +26,7 @@ def get_text(fn):
Main Content:
e.xpath('.//div[@class="show "]//text()')
'''
text = u''.join(e.xpath('.//div[@class="show "]//text()'))
text = u''.join(e.xpath('.//div[@class="show "]//text()')) # OPTIMIZE ME
return text


@@ -64,7 +48,7 @@ def main(articles_fn, articles_dir, index_dir):
# print text[:400]
writer.add_document(title=name, path=fn, content=text)

writer.commit()
writer.commit(optimize=True) # optimize did not result in any size improvements
print '%d docs in index' % ix.doc_count()


42 changes: 32 additions & 10 deletions layout.py
@@ -7,23 +7,45 @@
Relative URLs:
site/dewp/
site/enwp/
site/enwp/articles/c/0/Mainz
site/enwp/articles/f/9/AC/DC
site/enwp/articles/
site/enwp/articles/Mainz
site/enwp/articles/AC%20DC
site/enwp/images/
site/enwp/search/
site/enwp/resources/ (css, js)
site/enwp/images/00016aece4b15ed6c0931ffe29b400fd.jpeg
site/enwp/index.html
site/enwp/search.py
site/enwp/jquery.js
site/enwp/styles.css
Q: Navigation?
Q: What About Categories?
All local site links are relative
Resources as seen from article are:
../images/00016aece4b15ed6c0931ffe29b400fd.jpeg
../search.py
../jquery.js
../styles.css
Navigation?
What About Categories?
'''

import sha
import hashlib, os

def path_by_name(base_path, name):

def norm_ext_img_url(imgsrc):
if not imgsrc.startswith('http://'):
imgsrc = 'http:%s' % imgsrc
return imgsrc

def ext_img_url2fn(url):
ext = url.split('.')[-1]
return hashlib.md5(url).hexdigest() + '.' + ext

def normalize_external_url(url):
def ext_img_url2local_url(url):
pass
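A quick worked example of the new helpers (the image URL is made up; ext_img_url2local_url is still a stub, but '../images/' + ext_img_url2fn(url) would match the relative layout described in the docstring above):

# illustrative use of the new layout helpers (the image URL is made up)
import layout

src = '//upload.wikimedia.org/wikipedia/commons/thumb/Foo.jpg/220px-Foo.jpg'
url = layout.norm_ext_img_url(src)   # protocol-relative src -> 'http://upload.wikimedia.org/...'
fn = layout.ext_img_url2fn(url)      # md5 hexdigest of the url + '.jpg'
local = '../images/' + fn            # relative URL as seen from an article page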

4 changes: 2 additions & 2 deletions parsearticles.py
@@ -20,8 +20,8 @@ def parse_article(fn):
tree = etree.parse(open(fn), parser)
e = tree.getroot()
# links
#for a in e.xpath('.//a'):
# print a, a.get('name'), a.get('href'), [(e, e.get('class')) for e in a.xpath('ancestor-or-self::*[@class]')]
for a in e.xpath('.//a'):
print a, a.get('name'), a.get('href'), [(e, e.get('class')) for e in a.xpath('ancestor-or-self::*[@class]')]

# images

1 change: 1 addition & 0 deletions search.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
'''
pip install whoosh
'''
import sys, os
import urllib2
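search.py's body is not shown in this diff; for context, a minimal query against an index like the one indexarticles.py writes looks roughly like this (the index directory name is assumed; title/path/content match the fields indexarticles.py stores):

# minimal Whoosh query sketch; 'index' directory name is an assumption
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir('index')
with ix.searcher() as searcher:
    query = QueryParser('content', ix.schema).parse(u'mainz')
    for hit in searcher.search(query, limit=10):
        print hit['title'], hit['path']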
30 changes: 30 additions & 0 deletions www/webapp.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python
__author__ = 'heiko'
import os
import flask
app = flask.Flask(__name__)
pwd = os.path.dirname(__file__)
articles_dir = os.path.join(pwd, 'articles')


@app.route('/')
def index():
return 'Hello Homepage'


@app.route('/search/')
def search():
query = flask.request.args.get('query', '')  # read the query string; form data is not sent on a plain GET
return 'Searched for %s' % query

@app.route('/wiki/<path:name>')
def page(name):
try:
assert '..' not in name
return open(os.path.join(articles_dir, name)).read()
except IOError:
flask.abort(404)


if __name__ == '__main__':
app.run(debug=True)
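To try the webapp locally (assuming Flask is installed and some fetched article HTML files sit under www/articles/):

cd www && python webapp.py
# Flask's dev server listens on http://127.0.0.1:5000 by default
curl http://127.0.0.1:5000/
curl http://127.0.0.1:5000/wiki/Mainz
curl 'http://127.0.0.1:5000/search/?query=mainz'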
