
Commit

added basic webapp
heikoheiko committed Nov 18, 2012
1 parent 4daddb4 commit a55f8b9
Showing 12 changed files with 125 additions and 51 deletions.
8 changes: 8 additions & 0 deletions .idea/libraries/sass_stdlib.xml


3 changes: 3 additions & 0 deletions .idea/misc.xml


29 changes: 28 additions & 1 deletion README.md
@@ -1,4 +1,31 @@
wikifunken
==========

Tools to bring Wikipedia Offline

# Select NUM=1000 articles to download
~/dev/wikifunken/ ./articleselector.py scored_articles.15.11.12.txt 1000 > data/articles.txt
selected 1000 of 1139809 articles with minimum rank 1846

# Fetch articles
~/dev/wikifunken/ ./fetcharticles.py data/articles.txt data/articles/

# Extract image links
~/dev/wikifunken/ ./getimagelinks.py data/articles/ | sort | uniq > data/images.txt

# Download images (gevent-based; imagedownloader.py is the eventlet alternative)
~/dev/wikifunken/ ./cimagedownloader.py data/images.txt data/images/

# Define layout

# Process articles
# rewrite links, remove unwanted sections, ... (see the sketch after this list)

# Index articles
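The processing step above is still a TODO; as a rough illustration only, a hypothetical link-rewriting pass with lxml (not part of this commit; the helper name and target paths are assumptions) could look like this:

# hypothetical sketch of the link-rewriting pass, not in the repo yet
import urllib
from lxml import etree

def rewrite_links(fn):
    parser = etree.HTMLParser()
    tree = etree.parse(open(fn), parser)
    root = tree.getroot()
    for a in root.xpath('.//a[@href]'):
        href = a.get('href')
        if href.startswith('/wiki/'):
            # make wiki links relative, so they resolve inside site/enwp/articles/
            a.set('href', urllib.quote(href[len('/wiki/'):], safe=''))
    return etree.tostring(root, method='html')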


15 changes: 9 additions & 6 deletions cimagedownloader.py
@@ -1,12 +1,18 @@
#!/usr/bin/env python
"""
based on: https://github.com/gwik/geventhttpclient
This one totally rocks: fetches ~100 images/second from the Wikipedia servers.
Average image size is 17.4 KB, i.e. ~1.7 MB/s (~13 Mbit/s).
Downloads the ~400K images of a 50K-article collection in about an hour.
"""
import sys, os, hashlib
import gevent.pool
from geventhttpclient import HTTPClient
from geventhttpclient.url import URL
user_agent = 'Wikipedia 1.0 Bot'
import layout
#user_agent = 'Wikipedia 1.0 Bot'

http_clients = dict()

@@ -29,9 +35,6 @@ def fetch(http, url, fn, pool, num):
else:
print 'err', response.status_code, url

def url2fn(url):
ext = url.split('.')[-1]
return hashlib.md5(url).hexdigest() + '.' + ext

def main():
urls_fn = sys.argv[1]
@@ -45,7 +48,7 @@ def main():
for i,url in enumerate(open(urls_fn)):
#if i> 400: break
url = url.strip()
fn = os.path.join(images_dir, url2fn(url))
fn = os.path.join(images_dir, layout.url2fn(url))
if not os.path.exists(fn):
http = get_client(url)
pool.spawn(fetch, http, url, fn, pool, i)
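For readers unfamiliar with geventhttpclient, a minimal standalone example of the pattern this script builds on (illustrative only; the URL and target path are placeholders, and the real script pools one HTTPClient per host via get_client):

# minimal gevent + geventhttpclient fetch, illustrative only
import gevent.pool
from geventhttpclient import HTTPClient
from geventhttpclient.url import URL

def fetch_one(url, fn):
    u = URL(url)
    client = HTTPClient.from_url(u)
    response = client.get(u.request_uri)
    if response.status_code == 200:
        open(fn, 'wb').write(response.read())
    client.close()

pool = gevent.pool.Pool(20)   # cap concurrent downloads
pool.spawn(fetch_one, 'http://example.org/some-image.jpg', '/tmp/some-image.jpg')
pool.join()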
4 changes: 2 additions & 2 deletions fetcharticles.py
@@ -17,7 +17,6 @@ def safe_fn(fn):

def fetch(name):
url = 'http://en.m.wikipedia.org/wiki/' + urllib2.quote(name.encode('utf8'), safe='')
print 'fetching', url
req = urllib2.Request(url, headers={'User-Agent':user_agent})
res = urllib2.urlopen(req)
return res.read()
@@ -33,9 +32,10 @@ def main(articles_fn, out_dir):
# print 'already there', fn
continue
try:
print 'fetching', i, name
html = fetch(name)
except (urllib2.HTTPError, KeyError, urllib2.URLError), e:
print repr(name), name, e
print i, repr(name), name, e
continue
open(fn, 'w').write(html)
time.sleep(fetch_delay)
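Worth noting about the fetch URL above: quote(..., safe='') also escapes slashes, so article names that contain '/' still map to a single path segment, e.g.:

>>> import urllib2
>>> 'http://en.m.wikipedia.org/wiki/' + urllib2.quote(u'AC/DC'.encode('utf8'), safe='')
'http://en.m.wikipedia.org/wiki/AC%2FDC'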
5 changes: 4 additions & 1 deletion getimagelinks.py
@@ -14,14 +14,17 @@

import sys, os
from lxml import etree
import layout



def parse_article(fn):
parser = etree.HTMLParser()
tree = etree.parse(open(fn), parser)
e = tree.getroot()
for i in e.xpath('.//img'):
try:
print 'http:%s' % i.get('src')
print layout.norm_ext_img_url(i.get('src'))
except UnicodeEncodeError:
sys.stderr.write('UnicodeError %s %r \n' %(fn, i.get('src')))

15 changes: 4 additions & 11 deletions imagedownloader.py
@@ -9,18 +9,16 @@
searching for new urls, and dispatching new fetches. The GreenPool
acts as sort of a job coordinator (and concurrency controller of
course).
"""
"""
!!!! alternative: https://github.com/gwik/geventhttpclient
"""
"""
from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import sys, os, hashlib, time
import layout

pool_size = 10
pool_size = 20
pool = eventlet.GreenPool(pool_size)
user_agent = 'Wikipedia 1.0 Bot'

@@ -52,18 +50,13 @@ def fetchall(urlfns):
pool.spawn_n(fetch, urlfns.pop(), urlfns)
pool.waitall()

def url2fn(url):
ext = url.split('.')[-1]
return hashlib.md5(url).hexdigest() + '.' + ext


def main():
urls_fn = sys.argv[1]
images_dir = sys.argv[2]
urlfns = []
for url in open(urls_fn):
url = url.strip()
fn = os.path.join(images_dir, url2fn(url))
fn = os.path.join(images_dir, layout.ext_img_url2fn(url))
if not os.path.exists(fn):
urlfns.append((url, fn))
fetchall(urlfns)
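The eventlet counterpart of the gevent sketch above, boiled down (illustrative only; the real fetchall pops (url, filename) pairs off a shared list so running fetchers can dispatch new ones):

# minimal eventlet GreenPool fetch, illustrative only
from eventlet.green import urllib2
import eventlet

pool = eventlet.GreenPool(20)

def fetch(url, fn):
    req = urllib2.Request(url, headers={'User-Agent': 'Wikipedia 1.0 Bot'})
    open(fn, 'wb').write(urllib2.urlopen(req).read())

for url, fn in [('http://example.org/a.jpg', '/tmp/a.jpg')]:  # placeholder pair
    pool.spawn_n(fetch, url, fn)
pool.waitall()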
20 changes: 2 additions & 18 deletions indexarticles.py
@@ -1,22 +1,6 @@
#!/usr/bin/env python
'''
>>> from whoosh.fields import *
>>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
>>> ix = create_in("indexdir", schema)
>>> writer = ix.writer()
>>> writer.add_document(title=u"First document", path=u"/a",
... content=u"This is the first document we've added!")
>>> writer.add_document(title=u"Second document", path=u"/b",
... content=u"The second one is even more interesting!")
>>> writer.commit()
>>> from whoosh.qparser import QueryParser
>>> with ix.searcher() as searcher:
... query = QueryParser("content", ix.schema).parse("first")
... results = searcher.search(query)
... results[0]
...
{"title": u"First document", "path": u"/a"}
'''
import sys, os
import urllib2
@@ -42,7 +26,7 @@ def get_text(fn):
Main Content:
e.xpath('.//div[@class="show "]//text()')
'''
text = u''.join(e.xpath('.//div[@class="show "]//text()'))
text = u''.join(e.xpath('.//div[@class="show "]//text()')) # OPTIMIZE ME
return text


@@ -64,7 +48,7 @@ def main(articles_fn, articles_dir, index_dir):
# print text[:400]
writer.add_document(title=name, path=fn, content=text)

writer.commit()
writer.commit(optimize=True) # optimize did not result in any size improvements
print '%d docs in index' % ix.doc_count()


42 changes: 32 additions & 10 deletions layout.py
@@ -7,23 +7,45 @@
Relative URLs:
site/dewp/
site/enwp/
site/enwp/articles/c/0/Mainz
site/enwp/articles/f/9/AC/DC
site/enwp/articles/
site/enwp/articles/Mainz
site/enwp/articles/AC%20DC
site/enwp/images/
site/enwp/search/
site/enwp/resources/ (css, js)
site/enwp/images/00016aece4b15ed6c0931ffe29b400fd.jpeg
site/enwp/index.html
site/enwp/search.py
site/enwp/jquery.js
site/enwp/styles.css
Q: Navigation?
Q: What About Categories?
All local site links are relative
Resources as seen from article are:
../images/00016aece4b15ed6c0931ffe29b400fd.jpeg
../search.py
../jquery.js
../styles.css
Navigation?
What About Categories?
'''

import sha
import hashlib, os

def path_by_name(base_path, name):

def norm_ext_img_url(imgsrc):
if not imgsrc.startswith('http://'):
imgsrc = 'http:%s' % imgsrc
return imgsrc

def ext_img_url2fn(url):
ext = url.split('.')[-1]
return hashlib.md5(url).hexdigest() + '.' + ext

def normalize_external_url(url):
def ext_img_url2local_url(url):
pass
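A quick worked example of the new helpers (the image URL is made up; ext_img_url2local_url is still a stub, but '../images/' + ext_img_url2fn(url) would match the relative layout described in the docstring above):

# illustrative use of the new layout helpers (the image URL is made up)
import layout

src = '//upload.wikimedia.org/wikipedia/commons/thumb/Foo.jpg/220px-Foo.jpg'
url = layout.norm_ext_img_url(src)   # protocol-relative src -> 'http://upload.wikimedia.org/...'
fn = layout.ext_img_url2fn(url)      # md5 hexdigest of the url + '.jpg'
local = '../images/' + fn            # relative URL as seen from an article page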

4 changes: 2 additions & 2 deletions parsearticles.py
@@ -20,8 +20,8 @@ def parse_article(fn):
tree = etree.parse(open(fn), parser)
e = tree.getroot()
# links
#for a in e.xpath('.//a'):
# print a, a.get('name'), a.get('href'), [(e, e.get('class')) for e in a.xpath('ancestor-or-self::*[@class]')]
for a in e.xpath('.//a'):
print a, a.get('name'), a.get('href'), [(e, e.get('class')) for e in a.xpath('ancestor-or-self::*[@class]')]

# images

1 change: 1 addition & 0 deletions search.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
'''
pip install whoosh
'''
import sys, os
import urllib2
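search.py's body is not shown in this diff; for context, a minimal query against an index like the one indexarticles.py writes looks roughly like this (the index directory name is assumed; title/path/content match the fields indexarticles.py stores):

# minimal Whoosh query sketch; 'index' directory name is an assumption
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir('index')
with ix.searcher() as searcher:
    query = QueryParser('content', ix.schema).parse(u'mainz')
    for hit in searcher.search(query, limit=10):
        print hit['title'], hit['path']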
30 changes: 30 additions & 0 deletions www/webapp.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python
__author__ = 'heiko'
import os
import flask
app = flask.Flask(__name__)
pwd = os.path.dirname(__file__)
articles_dir = os.path.join(pwd, 'articles')


@app.route('/')
def index():
return 'Hello Homepage'


@app.route('/search/')
def search():
query = flask.request.args.get('query', '')  # read the query string; form data is not sent on a plain GET
return 'Searched for %s' % query

@app.route('/wiki/<path:name>')
def page(name):
try:
assert '..' not in name
return open(os.path.join(articles_dir, name)).read()
except IOError:
flask.abort(404)


if __name__ == '__main__':
app.run(debug=True)
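To try the webapp locally (assuming Flask is installed and some fetched article HTML files sit under www/articles/):

cd www && python webapp.py
# Flask's dev server listens on http://127.0.0.1:5000 by default
curl http://127.0.0.1:5000/
curl http://127.0.0.1:5000/wiki/Mainz
curl 'http://127.0.0.1:5000/search/?query=mainz'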
