diff --git a/cdx_writer.py b/cdx_writer.py index a37bb34..da74df8 100755 --- a/cdx_writer.py +++ b/cdx_writer.py @@ -11,8 +11,6 @@ The functions that start with "get_" (as opposed to "parse_") are called by the dispatch loop in make_cdx using getattr(). """ -from warctools import ArchiveRecord #from https://bitbucket.org/rajbot/warc-tools -from surt import surt #from https://github.com/rajbot/surt import os import re @@ -21,19 +19,32 @@ import chardet import hashlib import json -import urllib import urlparse -from datetime import datetime -from optparse import OptionParser +from datetime import datetime +from optparse import OptionParser + + +from warctools import ArchiveRecord +# from https://bitbucket.org/rajbot/warc-tools +from surt import surt # from https://github.com/rajbot/surt class ParseError(Exception): pass + class CDX_Writer(object): # init() - #___________________________________________________________________________ - def __init__(self, file, out_file=sys.stdout, format="N b a m s k r M S V g", use_full_path=False, file_prefix=None, all_records=False, screenshot_mode=False, exclude_list=None, stats_file=None): + def __init__(self, + file, + out_file=sys.stdout, + format="N b a m s k r M S V g", + use_full_path=False, + file_prefix=None, + all_records=False, + screenshot_mode=False, + exclude_list=None, + stats_file=None): self.field_map = {'M': 'AIF meta tags', 'N': 'massaged url', @@ -46,30 +57,32 @@ def __init__(self, file, out_file=sys.stdout, format="N b a m s k r M S V g", us 'm': 'mime type', 'r': 'redirect', 's': 'response code', - } + } - self.file = file + self.file = file self.out_file = out_file self.format = format - self.all_records = all_records + self.all_records = all_records self.screenshot_mode = screenshot_mode self.crlf_pattern = re.compile('\r?\n\r?\n') - self.response_pattern = re.compile('^application/http;\s*msgtype=response$', re.I) - - #similar to what what the wayback uses: - self.fake_build_version = "archive-commons.0.0.1-SNAPSHOT-20120112102659-python" - - #these fields are set for each record in the warc - self.offset = 0 - self.surt = None - self.mime_type = None - self.headers = None - self.content = None - self.meta_tags = None + self.response_pattern = re.compile( + '^application/http;\s*msgtype=response$', re.I) + + # similar to what the wayback uses: + self.fake_build_version = ( + "archive-commons.0.0.1-SNAPSHOT-20120112102659-python") + + # these fields are set for each record in the warc + self.offset = 0 + self.surt = None + self.mime_type = None + self.headers = None + self.content = None + self.meta_tags = None self.response_code = None - #Large html files cause lxml to segfault - #problematic file was 154MB, we'll stop at 5MB + # Large html files cause lxml to segfault + # problematic file was 154MB, we'll stop at 5MB self.lxml_parse_limit = 5 * 1024 * 1024 if use_full_path: @@ -81,7 +94,7 @@ def __init__(self, file, out_file=sys.stdout, format="N b a m s k r M S V g", us if exclude_list: if not os.path.exists(exclude_list): - raise IOError, "Exclude file not found" + raise IOError("Exclude file not found") self.excludes = [] f = open(exclude_list, 'r') for line in f: @@ -94,19 +107,17 @@ def __init__(self, file, out_file=sys.stdout, format="N b a m s k r M S V g", us if stats_file: if os.path.exists(stats_file): - raise IOError, "Stats file already exists" + raise IOError("Stats file already exists") self.stats_file = stats_file else: self.stats_file = None - # parse_http_header() - 
#___________________________________________________________________________ def parse_http_header(self, header_name): if self.headers is None: return None - pattern = re.compile(header_name+':\s*(.+)', re.I) + pattern = re.compile(header_name + ':\s*(.+)', re.I) for line in iter(self.headers): m = pattern.match(line) if m: @@ -114,13 +125,13 @@ def parse_http_header(self, header_name): return None # parse_http_content_type_header() - #___________________________________________________________________________ def parse_http_content_type_header(self, record): content_type = self.parse_http_header('content-type') if content_type is None: return 'unk' - # some http responses end abruptly: ...Content-Length: 0\r\nConnection: close\r\nContent-Type: \r\n\r\n\r\n\r\n' + # some http responses end abruptly: ...Content-Length: 0\r\nConnection: + # close\r\nContent-Type: \r\n\r\n\r\n\r\n' content_type = content_type.strip() if '' == content_type: return 'unk' @@ -134,9 +145,7 @@ def parse_http_content_type_header(self, record): else: return 'unk' - # parse_charset() - #___________________________________________________________________________ def parse_charset(self): charset = None charset_pattern = re.compile('charset\s*=\s*([a-z0-9_\-]+)', re.I) @@ -147,7 +156,6 @@ def parse_charset(self): if m: charset = m.group(1) - if charset is None and self.meta_tags is not None: content_type = self.meta_tags.get('content-type') if content_type: @@ -161,15 +169,16 @@ def parse_charset(self): return charset # parse_meta_tags - #___________________________________________________________________________ def parse_meta_tags(self, record): """We want to parse meta tags in , even if not direct children. e.g. What should we do about multiple meta tags with the same name? - currently, we append the content attribs together with a comma seperator. + currently, we append the content attribs together with a comma + seperator. - We use either the 'name' or 'http-equiv' attrib as the meta_tag dict key. + We use either the 'name' or 'http-equiv' attrib as the meta_tag dict + key. 
""" if not ('response' == record.type and 'text/html' == self.mime_type): @@ -180,12 +189,12 @@ def parse_meta_tags(self, record): meta_tags = {} - #lxml.html can't parse blank documents + # lxml.html can't parse blank documents html_str = self.content.strip() if '' == html_str: return meta_tags - #lxml can't handle large documents + # lxml can't handle large documents if record.content_length > self.lxml_parse_limit: return meta_tags @@ -195,7 +204,7 @@ def parse_meta_tags(self, record): # so we're going to give up on lxml and use regexes to parse html :( meta_tags = {} - #we only want to look for meta tags that occur before the tag + # we only want to look for meta tags that occur before the tag head_limit = None m = re.search('()', html_str, re.I) if m: @@ -208,13 +217,18 @@ def parse_meta_tags(self, record): name = None content = None - m = re.search(r'''\b(?:name|http-equiv)\s*=\s*(['"]?)(.*?)(\1)[\s/>]''', x.group(1), re.I) + m = re.search( + r'''\ + \b(?:name|http-equiv)\s*=\s*(['"]?)(.*?)(\1)[\s/>]''', + x.group(1), re.I) if m: name = m.group(2).lower() else: continue - m = re.search(r'''\bcontent\s*=\s*(['"]?)(.*?)(\1)[\s/>]''', x.group(1), re.I) + m = re.search( + r'''\bcontent\s*=\s*(['"]?)(.*?)(\1)[\s/>]''', x.group(1), + re.I) if m: content = m.group(2) else: @@ -224,14 +238,12 @@ def parse_meta_tags(self, record): meta_tags[name] = content else: if 'refresh' != name: - #for redirect urls, we only want the first refresh tag + # for redirect urls, we only want the first refresh tag meta_tags[name] += ',' + content return meta_tags - # get_AIF_meta_tags() //field "M" - #___________________________________________________________________________ def get_AIF_meta_tags(self, record): """robot metatags, if present, should be in this order: A, F, I """ @@ -263,7 +275,7 @@ def get_AIF_meta_tags(self, record): try: if int(sfps[0]) > 0: s += 'P' - except ValueError as ex: + except ValueError: pass if s: @@ -271,9 +283,7 @@ def get_AIF_meta_tags(self, record): else: return '-' - # get_massaged_url() //field "N" - #___________________________________________________________________________ def get_massaged_url(self, record, use_precalculated_value=True): if use_precalculated_value: return self.surt @@ -283,16 +293,14 @@ def get_massaged_url(self, record, use_precalculated_value=True): else: url = record.url if self.screenshot_mode: - url = 'http://web.archive.org/screenshot/'+url + url = 'http://web.archive.org/screenshot/' + url try: return surt(url) except: return self.get_original_url(record) - # get_compressed_record_size() //field "S" - #___________________________________________________________________________ def get_compressed_record_size(self, record): size = record.compressed_record_size if size is None: @@ -300,15 +308,11 @@ def get_compressed_record_size(self, record): return str(size) - # get_compressed_arc_file_offset() //field "V" - #___________________________________________________________________________ def get_compressed_arc_file_offset(self, record): return str(self.offset) - # get_original_url() //field "a" - #___________________________________________________________________________ def get_original_url(self, record): if 'warcinfo' == record.type: url = 'warcinfo:/%s/%s' % (self.file, self.fake_build_version) @@ -318,7 +322,8 @@ def get_original_url(self, record): # There are few arc files from 2002 that have non-ascii characters in # the url field. 
These are not utf-8 characters, and the charset of the - # page might not be specified, so use chardet to try and make these usable. + # page might not be specified, so use chardet to try and make these + # usable. if isinstance(url, str): try: url.decode('ascii') @@ -326,21 +331,25 @@ def get_original_url(self, record): enc = chardet.detect(url) if enc and enc['encoding']: if 'EUC-TW' == enc['encoding']: - # We don't have the EUC-TW encoding installed, and most likely - # something is so wrong that we probably can't recover this url + # We don't have the EUC-TW encoding installed, + # and most likely + # something is so wrong that we probably can't recover + # this url url = url.decode('Big5', 'replace') else: url = url.decode(enc['encoding'], 'replace') else: url = url.decode('utf-8', 'replace') - # Some arc headers contain urls with the '\r' character, which will cause - # problems downstream when trying to process this url, so escape it. + # Some arc headers contain urls with the '\r' character, which will + # cause problems downstream when trying to process this url, so escape + # it. # While we are at it, replace other newline chars. url = url.replace('\r', '%0D') url = url.replace('\n', '%0A') - url = url.replace('\x0c', '%0C') #formfeed - url = url.replace('\x00', '%00') #null may cause problems with downstream C programs + url = url.replace('\x0c', '%0C') # formfeed + # null may cause problems with downstream C programs + url = url.replace('\x00', '%00') if self.screenshot_mode: url = u'http://web.archive.org/screenshot/' + url @@ -348,41 +357,37 @@ def get_original_url(self, record): return url # get_date() //field "b" - #___________________________________________________________________________ def get_date(self, record): - #warcs and arcs use a different date format - #consider using dateutil.parser instead + # warcs and arcs use a different date format + # consider using dateutil.parser instead if record.date.isdigit(): date_len = len(record.date) if 14 == date_len: - #arc record already has date in the format we need + # arc record already has date in the format we need return record.date elif 16 == date_len: - #some arc records have 16-digit dates: 2000082305410049 + # some arc records have 16-digit dates: 2000082305410049 return record.date[:14] elif 18 == date_len: - #some arc records have 18-digit dates: 200009180023002953 + # some arc records have 18-digit dates: 200009180023002953 return record.date[:14] elif 12 == date_len: - #some arc records have 12-digit dates: 200011201434 + # some arc records have 12-digit dates: 200011201434 return record.date + '00' elif re.match('^[a-f0-9]+$', record.date): - #some arc records have a hex string in the date field + # some arc records have a hex string in the date field return '-' - #warc record + # warc record date = datetime.strptime(record.date, "%Y-%m-%dT%H:%M:%SZ") return date.strftime("%Y%m%d%H%M%S") # get_file_name() //field "g" - #___________________________________________________________________________ def get_file_name(self, record): return self.warc_path - # is_response() - #___________________________________________________________________________ def is_response(self, content_type): if content_type is None: return False @@ -393,9 +398,7 @@ def is_response(self, content_type): return got_match - # get_new_style_checksum() //field "k" - #___________________________________________________________________________ def get_new_style_checksum(self, record): """Return a base32-encoded sha1 For revisit records, return 
the original sha1 @@ -407,13 +410,18 @@ def get_new_style_checksum(self, record): return '-' else: return digest.replace('sha1:', '') - elif 'response' == record.type and self.is_response(record.content_type): + elif 'response' == ( + record.type and self.is_response(record.content_type) + ): digest = record.get_header('WARC-Payload-Digest') - #Our patched warc-tools fabricates this header if it is not present in the record + # Our patched warc-tools fabricates this header if it is not + # present in the record return digest.replace('sha1:', '') elif 'response' == record.type and self.content is not None: - # This is an arc record. Our patched warctools fabricates the WARC-Payload-Digest - # header even for arc files so that we don't need to load large payloads in memory + # This is an arc record. Our patched warctools fabricates the + # WARC-Payload-Digest + # header even for arc files so that we don't need to load large + # payloads in memory digest = record.get_header('WARC-Payload-Digest') if digest is not None: return digest.replace('sha1:', '') @@ -425,9 +433,9 @@ def get_new_style_checksum(self, record): return base64.b32encode(h.digest()) # get_mime_type() //field "m" - #___________________________________________________________________________ def get_mime_type(self, record, use_precalculated_value=True): - """ See the WARC spec for more info on 'application/http; msgtype=response' + """ + See the WARC spec for more info on 'application/http; msgtype=response' http://archive-access.sourceforge.net/warc/warc_file_format-0.16.html#anchor7 """ @@ -440,14 +448,14 @@ def get_mime_type(self, record, use_precalculated_value=True): if record.content_type is None: mime_type = 'unk' else: - #alexa arc files use 'no-type' instead of 'unk' + # alexa arc files use 'no-type' instead of 'unk' mime_type = record.content_type.replace('no-type', 'unk') elif 'warcinfo' == record.type: mime_type = 'warc-info' elif self.screenshot_mode and 'metadata' == record.type: mime_type = record.content[0] else: - mime_type = 'warc/'+record.type + mime_type = 'warc/' + record.type try: mime_type = mime_type.decode('ascii') @@ -456,14 +464,12 @@ def get_mime_type(self, record, use_precalculated_value=True): return mime_type - # to_unicode() - #___________________________________________________________________________ @classmethod def to_unicode(self, s, charset): if isinstance(s, str): if charset is None: - #try utf-8 and hope for the best + # try utf-8 and hope for the best s = s.decode('utf-8', 'replace') else: try: @@ -473,7 +479,6 @@ def to_unicode(self, s, charset): return s # urljoin_and_normalize() - #___________________________________________________________________________ @classmethod def urljoin_and_normalize(self, base, url, charset): """urlparse.urljoin removes blank fragments (trailing #), @@ -508,24 +513,26 @@ def urljoin_and_normalize(self, base, url, charset): http://www.seomoz.org/trifecta/fetch/page/http://www.example.com/ """ - url = self.to_unicode(url, charset) + url = self.to_unicode(url, charset) - #the base url is from the arc/warc header, which doesn't specify a charset + # the base url is from the arc/warc header, which doesn't specify a + # charset base = self.to_unicode(base, 'utf-8') try: joined_url = urlparse.urljoin(base, url) except ValueError: - #some urls we find in arc files no longer parse with python 2.7, - #e.g. 'http://\x93\xe0\x90E\x83f\x81[\x83^\x93\xfc\x97\xcd.com/' + # some urls we find in arc files no longer parse with python 2.7, + # e.g. 
'http://\x93\xe0\x90E\x83f\x81[\x83^\x93\xfc\x97\xcd.com/' return '-' # We were using os.path.normpath, but had to add too many patches - # when it was doing the wrong thing, such as turning http:// into http:/ + # when it was doing the wrong thing, such as turning http:// into + # http:/ m = re.match('(https?://.+?/)', joined_url) if m: domain = joined_url[:m.end(1)] - path = joined_url[m.end(1):] + path = joined_url[m.end(1):] if path.startswith('../'): path = path[3:] norm_url = domain + re.sub('/[^/]+/\.\./', '/', path) @@ -540,12 +547,10 @@ def urljoin_and_normalize(self, base, url, charset): elif url.endswith('#') and not norm_url.endswith('#'): norm_url += '#' - #encode spaces + # encode spaces return norm_url.replace(' ', '%20') - # get_redirect() //field "r" - #___________________________________________________________________________ def get_redirect(self, record): """Aaron, Ilya, and Kenji have proposed using '-' in the redirect column unconditionally, after a discussion on Sept 5, 2012. It turns out the @@ -556,9 +561,11 @@ def get_redirect(self, record): # response_code = self.response_code # - # ## It turns out that the refresh tag is being used in both 2xx and 3xx - # ## responses, so always check both the http location header and the meta - # ## tags. Also, the java version passes spaces through to the cdx file, + # ## It turns out that the refresh tag is being used in both 2xx and + # ## 3xx + # ## responses, so always check both the http location header and the + # ## meta + # ## tags. Also, the java version passes spaces through to the cdx file # ## which might break tools that split cdx lines on whitespace. # # #only deal with 2xx and 3xx responses: @@ -574,14 +581,15 @@ def get_redirect(self, record): # #elif response_code.startswith('2'): # if self.meta_tags and 'refresh' in self.meta_tags: # redir_loc = self.meta_tags['refresh'] - # m = re.search('\d+\s*;\s*url=(.+)', redir_loc, re.I) #url might be capitalized + # m = re.search('\d+\s*;\s*url=(.+)', redir_loc, re.I) + # # url might be capitalized # if m: - # return self.urljoin_and_normalize(record.url, m.group(1), charset) + # return self.urljoin_and_normalize(record.url, m.group(1), + # charset) # # return '-' # get_response_code() //field "s" - #___________________________________________________________________________ def get_response_code(self, record, use_precalculated_value=True): if use_precalculated_value: return self.response_code @@ -596,7 +604,6 @@ def get_response_code(self, record, use_precalculated_value=True): return '-' # split_headers_and_content() - #___________________________________________________________________________ def parse_headers_and_content(self, record): """Returns a list of header lines, split with splitlines(), and the content. 
We call splitlines() here so we only split once, and so \r\n and \n are @@ -605,12 +612,13 @@ def parse_headers_and_content(self, record): if 'response' == record.type and record.content[1].startswith('HTTP'): try: - headers, content = self.crlf_pattern.split(record.content[1], 1) + headers, content = self.crlf_pattern.split( + record.content[1], 1) except ValueError: headers = record.content[1] content = None headers = headers.splitlines() - elif self.screenshot_mode and 'metadata' == record.type: + elif self.screenshot_mode and 'metadata' == record.type: headers = None content = record.content[1] else: @@ -619,9 +627,7 @@ def parse_headers_and_content(self, record): return headers, content - # should_exclude() - #___________________________________________________________________________ def should_exclude(self, surt_url): if not self.excludes: return False @@ -632,27 +638,26 @@ def should_exclude(self, surt_url): return False - # make_cdx() - #___________________________________________________________________________ def make_cdx(self): if isinstance(self.out_file, basestring): self.out_file = open(self.out_file, 'wb') - self.out_file.write(' CDX ' + self.format + '\n') #print header + self.out_file.write(' CDX ' + self.format + '\n') # print header if not self.all_records: - #filter cdx lines if --all-records isn't specified - allowed_record_types = set(['response', 'revisit']) + # filter cdx lines if --all-records isn't specified + allowed_record_types = set(['response', 'revisit']) disallowed_content_types = set(['text/dns']) stats = { 'num_records_processed': 0, - 'num_records_included': 0, - 'num_records_filtered': 0, + 'num_records_included': 0, + 'num_records_filtered': 0, } fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r") - for (offset, record, errors) in fh.read_records(limit=None, offsets=True): + for (offset, record, errors) in fh.read_records(limit=None, + offsets=True): self.offset = offset if record: @@ -660,49 +665,61 @@ def make_cdx(self): if self.screenshot_mode: if record.type != 'metadata': continue - elif not self.all_records and (record.type not in allowed_record_types or record.content_type in disallowed_content_types): + elif not self.all_records and( + record.type not in allowed_record_types or + record.content_type in disallowed_content_types + ): continue - ### arc files from the live web proxy can have a negative content length and a missing payload - ### check the content_length from the arc header, not the computed payload size returned by record.content_length + # arc files from the live web proxy can have a negative content + # length and a missing payload + # check the content_length from the arc header, not the + # computed payload size returned by record.content_length content_length_str = record.get_header(record.CONTENT_LENGTH) - if content_length_str is not None and int(content_length_str) < 0: + if ( + content_length_str is not None and + int(content_length_str) < 0 + ): continue - self.surt = self.get_massaged_url(record, use_precalculated_value=False) + self.surt = self.get_massaged_url( + record, use_precalculated_value=False) if self.should_exclude(self.surt): stats['num_records_filtered'] += 1 continue - ### precalculated data that is used multiple times - self.headers, self.content = self.parse_headers_and_content(record) - self.mime_type = self.get_mime_type(record, use_precalculated_value=False) - self.response_code = self.get_response_code(record, use_precalculated_value=False) - self.meta_tags = 
self.parse_meta_tags(record) + # precalculated data that is used multiple times + self.headers, self.content = self.parse_headers_and_content( + record) + self.mime_type = self.get_mime_type( + record, use_precalculated_value=False) + self.response_code = self.get_response_code( + record, use_precalculated_value=False) + self.meta_tags = self.parse_meta_tags(record) s = u'' for field in self.format.split(): - if not field in self.field_map: + if field not in self.field_map: raise ParseError('Unknown field: ' + field) endpoint = self.field_map[field].replace(' ', '_') response = getattr(self, 'get_' + endpoint)(record) - #print self.offset - #print record.compressed_record_size - #print record.content_length - #print record.headers - #print len(self.content) - #print repr(record.content[1]) - #print endpoint - #print repr(response) + # print self.offset + # print record.compressed_record_size + # print record.content_length + # print record.headers + # print len(self.content) + # print repr(record.content[1]) + # print endpoint + # print repr(response) s += response + ' ' - self.out_file.write(s.rstrip().encode('utf-8')+'\n') - #record.dump() + self.out_file.write(s.rstrip().encode('utf-8') + '\n') + # record.dump() stats['num_records_included'] += 1 elif errors: raise ParseError(str(errors)) else: - pass # tail + pass # tail fh.close() @@ -712,27 +729,46 @@ def make_cdx(self): f.close() # main() -#_______________________________________________________________________________ if __name__ == '__main__': parser = OptionParser(usage="%prog [options] warc.gz [output_file.cdx]") - parser.set_defaults(format = "N b a m s k r M S V g", - use_full_path = False, - file_prefix = None, - all_records = False, - screenshot_mode = False, - exclude_list = None, - ) - - parser.add_option("--format", dest="format", help="A space-separated list of fields [default: '%default']") - parser.add_option("--use-full-path", dest="use_full_path", action="store_true", help="Use the full path of the warc file in the 'g' field") - parser.add_option("--file-prefix", dest="file_prefix", help="Path prefix for warc file name in the 'g' field." - " Useful if you are going to relocate the warc.gz file after processing it." - ) - parser.add_option("--all-records", dest="all_records", action="store_true", help="By default we only index http responses. Use this flag to index all WARC records in the file") - parser.add_option("--screenshot-mode", dest="screenshot_mode", action="store_true", help="Special Wayback Machine mode for handling WARCs containing screenshots") - parser.add_option("--exclude-list", dest="exclude_list", help="File containing url prefixes to exclude") - parser.add_option("--stats-file", dest="stats_file", help="Output json file containing statistics") + parser.set_defaults(format="N b a m s k r M S V g", + use_full_path=False, + file_prefix=None, + all_records=False, + screenshot_mode=False, + exclude_list=None, + ) + + parser.add_option("--format", + dest="format", + help="A space-separated list of fields " + "[default: '%default']") + parser.add_option("--use-full-path", + dest="use_full_path", + action="store_true", + help="Use the full path of the warc file in the 'g' " + "field") + parser.add_option("--file-prefix", + dest="file_prefix", + help="Path prefix for warc file name in the 'g' field." + " Useful if you are going to relocate the warc.gz " + "file after processing it." 
+ ) + parser.add_option("--all-records", + dest="all_records", + action="store_true", + help="By default we only index http responses." + "Use this flag to index all WARC records in the file") + parser.add_option("--screenshot-mode", + dest="screenshot_mode", + action="store_true", + help="Special Wayback Machine mode for handling " + "WARCs containing screenshots") + parser.add_option("--exclude-list", dest="exclude_list", + help="File containing url prefixes to exclude") + parser.add_option("--stats-file", dest="stats_file", + help="Output json file containing statistics") (options, input_files) = parser.parse_args(args=sys.argv[1:]) @@ -745,11 +781,11 @@ def make_cdx(self): cdx_writer = CDX_Writer(input_files[0], input_files[1], format=options.format, - use_full_path = options.use_full_path, - file_prefix = options.file_prefix, - all_records = options.all_records, - screenshot_mode = options.screenshot_mode, - exclude_list = options.exclude_list, - stats_file = options.stats_file, - ) + use_full_path=options.use_full_path, + file_prefix=options.file_prefix, + all_records=options.all_records, + screenshot_mode=options.screenshot_mode, + exclude_list=options.exclude_list, + stats_file=options.stats_file, + ) cdx_writer.make_cdx() diff --git a/warctools/__init__.py b/warctools/__init__.py new file mode 100644 index 0000000..cb7aa56 --- /dev/null +++ b/warctools/__init__.py @@ -0,0 +1,13 @@ +from .record import ArchiveRecord +from .warc import WarcRecord +from .arc import ArcRecord +from . import record, warc, arc + +__all__= [ + 'ArchiveRecord', + 'ArcRecord', + 'WarcRecord', + 'record', + 'warc', + 'arc' +] diff --git a/warctools/arc.py b/warctools/arc.py new file mode 100644 index 0000000..2ba8d15 --- /dev/null +++ b/warctools/arc.py @@ -0,0 +1,288 @@ +"""An object to represent arc records""" + +import sys +import re +import base64 +import hashlib +import zlib + +from .record import ArchiveRecord,ArchiveParser +from .stream import open_record_stream +from .archive_detect import register_record_type + +# URLIP-addressArchive-dateContent-type +#Result-codeChecksumLocation OffsetFilename +#Archive-length +# +@ArchiveRecord.HEADERS( + URL='URL', + IP='IP-address', + DATE='Archive-date', + CONTENT_TYPE = 'Content-type', + CONTENT_LENGTH = 'Archive-length', + RESULT_CODE = 'Result-code', + CHECKSUM = 'Checksum', + LOCATION = 'Location', + OFFSET = 'Offset', + FILENAME = 'Filename', +) +class ArcRecord(ArchiveRecord): + def __init__(self, headers=None, content=None, errors=None): + ArchiveRecord.__init__(self,headers,content,errors) + + @property + def type(self): + return "response" + + def _write_to(self, out, nl): + pass + + @classmethod + def make_parser(self): + return ArcParser() + + +class ArcRecordHeader(ArcRecord): + def __init__(self, headers=None, content=None, errors=None, version=None, raw_headers=None): + ArcRecord.__init__(self,headers,content,errors) + self.version = version + self.raw_headers = raw_headers + @property + def type(self): + return "filedesc" + def raw(self): + return "".join(self.raw_headers) + self.content[1] + +def rx(pat): + return re.compile(pat,flags=re.IGNORECASE) + +nl_rx=rx('^\r\n|\r|\n$') +length_rx = rx('^'+ArcRecord.CONTENT_LENGTH+'$') +type_rx = rx('^'+ArcRecord.CONTENT_TYPE+'$') + +class ArcParser(ArchiveParser): + def __init__(self): + self.version = 0 + # we don't know which version to parse initially - a v1 or v2 file + # so we read the filedesc because the order and number of the headers change + # between versions. + + # question? 
will we get arc fragments? + # should we store both headers & detect records by header length? + # if we don't know + + self.headers = [] + self.trailing_newlines = 0 + + #raj: alexa arc files don't always have content-type in header + self.short_headers = [ArcRecord.URL, ArcRecord.IP, ArcRecord.DATE, ArcRecord.CONTENT_LENGTH] + + def parse(self, stream, offset): + record = None + content_type = None + content_length = None + try: + line = stream.readline() + except zlib.error: + #raj: some ARC files contain trailing padding zeros + #see DR_crawl22.20030622054102-c/DR_crawl22.20030622142039.arc.gz for an example + return (None,(), offset) + while not line.rstrip(): + if not line: + return (None,(), offset) + self.trailing_newlines-=1 + if offset: + offset += len(line) #rajbot + line = stream.readline() + + while line.endswith('\r'): + #raj: some ARC record headers contain a url with a '\r' character. + #The record header should end in \n, but we may also encounter + #malformed header lines that end with \r, so we need to see of we + #read the whole header, or just part of the url. + if not re.search('(?:\d{1,3}\.){3}\d{1,3} \d{14} \S+ \d+$', line): + line += stream.readline() + else: + break + + if line.startswith('filedesc:'): + raw_headers = [] + raw_headers.append(line) + # read headers named in body of record + # to assign names to header, to read body of record + arc_version_line = stream.readline() + raw_headers.append(arc_version_line) + arc_names_line = stream.readline() + raw_headers.append(arc_names_line) + + arc_version=arc_version_line.strip() + + # configure parser instance + self.version = arc_version.split()[0] + self.headers = arc_names_line.strip().split() + + # raj: some v1 ARC files are incorrectly sending a v2 header names line + if arc_names_line == 'URL IP-address Archive-date Content-type Result-code Checksum Location Offset Filepath Archive-length\n': + if arc_version == '1 0 InternetArchive' and 5 == len(line.split(' ')): + self.headers = ['URL', 'IP-address', 'Archive-date', 'Content-type', 'Archive-length'] + + # now we have read header field in record body + # we can extract the headers from the current record, + # and read the length field + + # which is in a different place with v1 and v2 + + # read headers + arc_headers = self.get_header_list(line.strip().split()) + + # extract content, ignoring header lines parsed already + content_type, content_length, errors = self.get_content_headers(arc_headers) + + content_length = content_length - len(arc_version_line) - len(arc_names_line) + + record = ArcRecordHeader(headers = arc_headers, version=arc_version, errors=errors, raw_headers=raw_headers) + + else: + if not self.headers: + #raj: some arc files are missing the filedesc:// line + #raise StandardHeader('missing filedesc') + self.version = '1' + self.headers = ['URL', 'IP-address', 'Archive-date', 'Content-type', 'Archive-length'] + + #raj: change the call to split below to only split on space (some arcs have a \x0c formfeed character in the url) + headers = self.get_header_list(line.strip().split(' ')) + content_type, content_length, errors = self.get_content_headers(headers) + + record = ArcRecord(headers = headers, errors=errors) + + ### raj: + ### We do this because we don't want to read large records into memory, + ### since this was exhasting memory and crashing for large payloads. 
+ sha1_digest = None + if record.url.startswith('http'): + parsed_http_header = False + sha1_digest = hashlib.sha1() + else: + #This isn't a http response so pretend we already parsed the http header + parsed_http_header = True + + line = None + + if content_length > 0: ###raj: some arc files have a negative content_length and no payload. + content=[] + length = 0 + + should_skip_content = False + if content_length > ArchiveParser.content_length_limit: + should_skip_content = True + + while length < content_length: + if not parsed_http_header: + line = stream.readline() + else: + bytes_to_read = min(content_length-length, 1024) + line = stream.read(bytes_to_read) #TODO: rename variable. may be more than just one line + if not line: + #print 'no more data' + break + + if should_skip_content: + if not parsed_http_header: + content.append(line) + else: + content.append(line) + + length+=len(line) + + if sha1_digest: + if parsed_http_header: + if length <= content_length: + sha1_digest.update(line) + else: + sha1_digest.update(line[:-(length-content_length)]) + + if not parsed_http_header: + if nl_rx.match(line): + parsed_http_header = True + + if sha1_digest: + sha1_str = 'sha1:'+base64.b32encode(sha1_digest.digest()) + record.headers.append(('WARC-Payload-Digest', sha1_str)) + + content="".join(content) + + ### note the content_length+1 below, which is not in the WARC parser + ### the +1 might be a bug + #content, line = content[0:content_length], content[content_length+1:] + content = content[0:content_length] + if length > content_length: + #line is the last line we read + trailing_chars = line[-(length-content_length):] #note that we removed the +1 from above + else: + trailing_chars = '' + + record.content = (content_type, content) + + if trailing_chars: + record.error('trailing data at end of record', line) + if trailing_chars == '': + self.trailing_newlines = 1 + + return (record, (), offset) + + def trim(self, stream): + return () + + def get_header_list(self, values): + num_values = len(values) + + #raj: some headers contain urls with unescaped spaces + if num_values > 5: + if re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match('^\d{14}$', values[-3]) and re.match('^\d+$', values[-1]): + values = ['%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]] + num_values = len(values) + + if 4 == num_values: + #raj: alexa arc files don't always have content-type in header + return zip(self.short_headers, values) + elif 5 == num_values: + #normal case + #raj: some old alexa arcs have ip-address and date transposed in the header + if re.match('^\d{14}$', values[1]) and re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[2]): + values[1], values[2] = values[2], values[1] + + return zip(self.headers, values) + elif 6 == num_values: + #raj: some old alexa arcs have "content-type; charset" in the header + v = values[0:4]+values[5:] + v[3] = v[3].rstrip(';') + return zip(self.headers, v) + else: + raise Exception('invalid number of header fields') + + @staticmethod + def get_content_headers(headers): + content_type = None + content_length = None + errors = [] + + for name,value in headers: + if type_rx.match(name): + if value: + content_type = value + else: + errors.append(('invalid header',name,value)) + elif length_rx.match(name): + try: + content_length = int(value) + except ValueError: + errors.append(('invalid header',name,value)) + + return content_type, content_length, errors + + +register_record_type(re.compile('^filedesc://'), ArcRecord) + +#raj: some arc files are 
missing the filedesc:// line +url_record_regex = re.compile('^https?://\S+ (?:\d{1,3}\.){3}\d{1,3} \d{14} \S+ \d+$') +register_record_type(url_record_regex, ArcRecord) diff --git a/warctools/archive_detect.py b/warctools/archive_detect.py new file mode 100644 index 0000000..df09f2d --- /dev/null +++ b/warctools/archive_detect.py @@ -0,0 +1,27 @@ +import gzip + +archive_types = [] + +def is_gzip_file(file_handle): + signature = file_handle.read(2) + file_handle.seek(-len(signature),1) + return signature == '\x1f\x8b' + +def guess_record_type(file_handle): + offset = file_handle.tell() + if is_gzip_file(file_handle): + nfh=gzip.GzipFile(fileobj=file_handle) + else: + nfh=file_handle + + line = nfh.readline() + file_handle.seek(offset) + for rx, record in archive_types: + if rx.match(line): + return record + + else: + return None + +def register_record_type(rx, record): + archive_types.append((rx,record)) diff --git a/warctools/log.py b/warctools/log.py new file mode 100644 index 0000000..39e2c7f --- /dev/null +++ b/warctools/log.py @@ -0,0 +1,9 @@ +import sys + +__all__ = ['debug'] + +debug = lambda:None + +if __debug__: + def debug(*args): + print >> sys.stderr, 'WARCTOOLS',args diff --git a/warctools/record.py b/warctools/record.py new file mode 100644 index 0000000..736374a --- /dev/null +++ b/warctools/record.py @@ -0,0 +1,128 @@ +"""a skeleton class for archive records""" + +from gzip import GzipFile +import re + +from .stream import open_record_stream + +strip = re.compile(r'[^\w\t \|\\\/]') + +def add_headers(**kwargs): + """a useful helper for defining header names in record formats""" + def _add_headers(cls): + for k,v in kwargs.iteritems(): + setattr(cls,k,v) + cls._HEADERS = kwargs.keys() + return cls + return _add_headers + +class ArchiveParser(object): + """ methods parse, and trim """ + #To avoid exhausing memory while reading large payloads, don't + #store large records. + content_length_limit = 5 * 1024 * 1024 + + + +@add_headers(DATE='Date', CONTENT_TYPE='Type', CONTENT_LENGTH='Length', TYPE='Type',URL='Url') +class ArchiveRecord(object): + """An archive record has some headers, maybe some content and + a list of errors encountered. record.headers is a list of tuples (name, + value). 
errors is a list, and content is a tuple of (type, data)""" + def __init__(self, headers=None, content=None, errors=None): + self.headers = headers if headers else [] + self.content = content if content else (None, "") + self.errors = errors if errors else [] + + self.compressed_record_size = None ###rajbot + + HEADERS=staticmethod(add_headers) + + @property + def date(self): + return self.get_header(self.DATE) + + def error(self, *args): + self.errors.append(args) + + @property + def type(self): + return self.get_header(self.TYPE) + + @property + def content_type(self): + return self.content[0] + + @property + def content_length(self): + return len(self.content[1]) + + @property + def url(self): + return self.get_header(self.URL) + + def get_header(self, name): + for k,v in self.headers: + if name == k: + return v + + def set_header(self, name, value): + self.headers = [(k,v) for (k,v) in self.headers if k != name] + self.headers.append((name, value)) + + def dump(self, content=True): + print 'Headers:' + for (h,v) in self.headers: + print '\t%s:%s'%(h,v) + if content and self.content: + print 'Content Headers:' + content_type, content_body = self.content + print '\t',self.CONTENT_TYPE,':',content_type + print '\t',self.CONTENT_LENGTH,':',len(content_body) + print 'Content:' + ln = min(1024, len(content_body)) + print '\t', strip.sub(lambda x:'\\x%00X'%ord(x.group()),content_body[:ln]) + print '\t...' + print + else: + print 'Content: none' + print + print + if self.errors: + print 'Errors:' + for e in self.errors: + print '\t', e + + def write_to(self, out, newline='\x0D\x0A', gzip=False): + if gzip: + out=GzipFile(fileobj=out) + self._write_to(out, newline) + if gzip: + out.flush() + out.close() + + + def _write_to(self, out, newline): + raise AssertionError, 'this is bad' + + + ### class methods for parsing + @classmethod + def open_archive(cls , filename=None, file_handle=None, mode="rb+", gzip="auto"): + """Generically open an archive - magic autodetect""" + if cls is ArchiveRecord: + cls = None # means guess + return open_record_stream(cls, filename, file_handle, mode, gzip) + + @classmethod + def make_parser(self): + """Reads a (w)arc record from the stream, returns a tuple (record, errors). + Either records is null or errors is null. Any record-specific errors are + contained in the record - errors is only used when *nothing* could be parsed""" + raise StandardError + + + + + + diff --git a/warctools/stream.py b/warctools/stream.py new file mode 100644 index 0000000..d394b99 --- /dev/null +++ b/warctools/stream.py @@ -0,0 +1,259 @@ +"""Read records from normal file and compressed file""" + +import zlib +import gzip +import re + +from .log import debug +from .archive_detect import is_gzip_file, guess_record_type + +def open_record_stream(record_class=None, filename=None, file_handle=None, mode="rb+", gzip="auto"): + """Can take a filename or a file_handle. Normally called indirectly from + A record class i.e WarcRecord.open_archive. 
If the first parameter is None, will try to guess""" + + if file_handle is None: + file_handle = open(filename, mode=mode) + else: + if not filename: + filename = file_handle.name + + if record_class == None: + record_class = guess_record_type(file_handle) + + if record_class == None: + raise StandardError('Failed to guess compression') + + record_parser = record_class.make_parser() + + if gzip == 'auto': + if is_gzip_file(file_handle): + gzip = 'record' + #debug('autodetect: record gzip') + else: + # assume uncompressed file + #debug('autodetected: uncompressed file') + gzip = None + + + if gzip=='record': + return GzipRecordStream(file_handle, record_parser) + elif gzip=='file': + return GzipFileStream(file_handle, record_parser) + else: + return RecordStream(file_handle, record_parser) + + +class RecordStream(object): + """A readable/writable stream of Archive Records. Can be iterated over + or read_records can give more control, and potentially offset information. + """ + def __init__(self, file_handle, record_parser): + self.fh = file_handle + self.record_parser = record_parser + self._parser = None + + def seek(self, offset, pos=0): + """Same as a seek on a file""" + self.fh.seek(offset,pos) + + def read_records(self, limit=1, offsets=True): + """Yield a tuple of (offset, record, errors) where + Offset is either a number or None. + Record is an object and errors is an empty list + or record is none and errors is a list""" + + nrecords = 0 + while nrecords < limit or limit is None: + offset, record, errors = self._read_record(offsets) + nrecords+=1 + yield (offset, record,errors) + if not record: + break + + def __iter__(self): + while True: + offset, record, errors = self._read_record(offsets=False) + if record: + yield record + elif errors: + raise StandardError('Errors while decoding '+",".join(errors)) + else: + break + + def _read_record(self, offsets): + """overridden by sub-classes to read individual records""" + offset = self.fh.tell() if offsets else None + record, errors, offset = self.record_parser.parse(self.fh, offset) + if record: + #TODO: use compressed_record_size to store size of UNcompressed record for now, rename later + record.compressed_record_size = self.fh.tell() - offset + + return offset, record, errors + + def write(self, record): + record.write_to(self) + + def close(self): + self.fh.close() + +class GzipRecordStream(RecordStream): + """A stream to read/write concatted file made up of gzipped archive records""" + def __init__(self, file_handle, record_parser): + RecordStream.__init__(self,file_handle, record_parser) + self.gz = None + + ###rajbot + self.next_record = None + self.next_errors = None + self.next_offset = None + + def _read_record(self, offsets): + """rajbot: restructure this function to call parse() twice, once + to read the record, and once to read possible trailing bytes. + This is to ensure that the compressed_record_size is correct. 
+ """ + errors = [] + + if self.next_record: + record = self.next_record + errors = self.next_errors + offset = self.next_offset if offsets else None + + self.next_record = None + self.next_errors = None + self.next_offset = None + + return offset, record, errors + + + offset_ = self.fh.tell() + self.gz = GzipRecordFile(self.fh) + record, r_errors, tmp_offset = self.record_parser.parse(self.gz, offset=None) + + next_offset_ = self.fh.tell() + next_record, next_errors, tmp_next_offset = self.record_parser.parse(self.gz, offset=None) + + if not next_record: + if record: + record.compressed_record_size = self.fh.tell() - offset_ + else: + next_record.error('multiple warc records in gzip record file') + if record: + record.compressed_record_size = next_offset_ - offset_ + next_record.compressed_record_size = self.fh.tell() - next_offset_ + self.next_record = next_record + self.next_errors = next_errors + self.next_offset = next_offset_ + + self.gz.close() + + offset = offset_ if offsets else None + errors.extend(r_errors) + return offset, record, errors + + +class GzipFileStream(RecordStream): + """A stream to read/write gzipped file made up of all archive records""" + def __init__(self, file_handle, record): + RecordStream.__init__(self,gzip.GzipFile(fileobj=file_handle), record) + def _read_record(self, offsets): + # no real offsets in a gzipped file (no seperate records) + return RecordStream._read_record(self, False) + + + +### record-gzip handler, based on zlib +### implements readline() access over a a single +### gzip-record. must be re-created to read another record + + +CHUNK_SIZE=1024 # the size to read in, make this bigger things go faster. +line_rx=re.compile('^(?P^[^\r\n]*(?:\r\n|\r(?!\n)|\n))(?P.*)$',re.DOTALL) + +class GzipRecordFile(object): + """A file like class providing 'readline' over catted gzip'd records""" + def __init__(self, fh): + self.fh = fh + self.buffer="" + self.z = zlib.decompressobj(16+zlib.MAX_WBITS) + self.done = False + + + def _getline(self): + if self.buffer: + #a,nl,b + match=line_rx.match(self.buffer) + #print match + # print 'split:', split[0],split[1], len(split[2]) + if match: + output = match.group('line') + self.buffer = ""+match.group('tail') + return output + elif self.done: + output = self.buffer + self.buffer = "" + + return output + + def readline(self): + while True: + output = self._getline() + if output: + return output + + if self.done: + return "" + + #print 'read chunk at', self.fh.tell(), self.done + chunk = self.fh.read(CHUNK_SIZE) + out = self.z.decompress(chunk) + + if out: + #rajbot: if the decompressed string ends with a \r, then + #read another chunk in case a \n is next. We need to do + #this because the line_rx pattern allows \r to be a valid + #newline, which causes the following \n to become a blank + #line which might prematurely end parsing of warc headers. 
+ if out.endswith('\r') and not self.z.unused_data: + next_chunk = self.fh.read(CHUNK_SIZE) + next_out = self.z.decompress(next_chunk) + if next_out is not None: + out += next_out + + self.buffer+=out + + if self.z.unused_data: + #print 'unused', len(self.z.unused_data) + self.fh.seek(-len(self.z.unused_data),1) + self.done=True + continue + if not chunk: + self.done = True + continue + + def read(self, num_bytes): + # add bytes to self.buff if the current buffer doesn't have enough + # data in it, and we have not exhaused the compressed stream + while (len(self.buffer) < num_bytes) and not self.done: + chunk = self.fh.read(CHUNK_SIZE) + d_bytes = self.z.decompress(chunk) + if d_bytes: + self.buffer += d_bytes + + if self.z.unused_data: + #print 'unused', len(self.z.unused_data) + self.fh.seek(-len(self.z.unused_data),1) + self.done=True + break + if not chunk: + self.done = True + break + + out = self.buffer[:num_bytes] + self.buffer = self.buffer[num_bytes:] + + return out + + def close(self): + if self.z: + self.z.flush() diff --git a/warctools/warc.py b/warctools/warc.py new file mode 100644 index 0000000..8c933bc --- /dev/null +++ b/warctools/warc.py @@ -0,0 +1,454 @@ +"""An object to represent warc records, using the abstract record in record.py""" + +import re +import base64 +import hashlib +from .record import ArchiveRecord,ArchiveParser +from .archive_detect import register_record_type + +bad_lines = 5 # when to give up looking for the version stamp + +@ArchiveRecord.HEADERS( + DATE='WARC-Date', + TYPE = 'WARC-Type', + ID = 'WARC-Record-ID', + CONCURRENT_TO = 'WARC-Concurrent-To', + REFERS_TO = 'WARC-Refers-To', + CONTENT_LENGTH = 'Content-Length', + CONTENT_TYPE = 'Content-Type', + URL='WARC-Target-URI', + BLOCK_DIGEST='WARC-Block-Digest', + IP_ADDRESS='WARC-IP-Address', + FILENAME='WARC-Filename', + WARCINFO_ID='WARC-Warcinfo-ID', + PAYLOAD_DIGEST = 'WARC-Payload-Digest', +) +class WarcRecord(ArchiveRecord): + VERSION="WARC/1.0" + VERSION18="WARC/0.18" + VERSION17="WARC/0.17" + RESPONSE="response" + REQUEST="request" + METADATA="metadata" + CONVERSION="conversion" + WARCINFO="warcinfo" + + def __init__(self, version=VERSION, headers=None, content=None, errors=None): + ArchiveRecord.__init__(self,headers,content,errors) + self.version = version + + @property + def id(self): + return self.get_header(self.ID) + + def _write_to(self, out, nl): + """WARC Format: + VERSION NL + (Key: Value NL)* + NL + CONTENT NL + NL + + don't write multi line headers + """ + out.write(self.version) + out.write(nl) + for k,v in self.headers: + if k not in (self.CONTENT_TYPE, self.CONTENT_LENGTH, self.BLOCK_DIGEST): + out.write(k) + out.write(": ") + out.write(v) + out.write(nl) + content_type, content_buffer = self.content + content_buffer=buffer(content_buffer) + if content_type: + out.write(self.CONTENT_TYPE) + out.write(": ") + out.write(content_type) + out.write(nl) + if content_buffer is None: + content_buffer="" + + content_length = len(content_buffer) + out.write(self.CONTENT_LENGTH) + out.write(": ") + out.write(str(content_length)) + out.write(nl) + + block_hash = hashlib.sha256() + block_hash.update(content_buffer) + + digest= "sha256:%s"%block_hash.hexdigest() + + out.write(self.BLOCK_DIGEST) + out.write(": ") + out.write(digest) + out.write(nl) + + # end of header blank nl + out.write(nl) + if content_buffer: + out.write(content_buffer[:content_length]) + out.write(nl) + out.write(nl) + out.flush() + + def repair(self): + pass + + def validate(self): + return self.errors + + @classmethod + 
def make_parser(self): + return WarcParser() + +def rx(pat): + return re.compile(pat,flags=re.IGNORECASE) + +version_rx = rx(r'^(?P.*?)(?P\s*WARC/(?P.*?))' '(?P\r\n|\r|\n)\\Z') +# a header is key: value plus any following lines with leading whitespace +header_rx = rx(r'^(?P.*?):\s?(?P.*?)' '(?P\r\n|\r|\n)\\Z') +value_rx = rx(r'^\s+(?P.+?)' '(?P\r\n|\r|\n)\\Z') +nl_rx=rx('^(?P\r\n|\r|\n\\Z)') +length_rx = rx('^'+WarcRecord.CONTENT_LENGTH+'$') +type_rx = rx('^'+WarcRecord.CONTENT_TYPE+'$') + +required_headers = set(( + WarcRecord.TYPE.lower(), + WarcRecord.ID.lower(), + WarcRecord.CONTENT_LENGTH.lower(), + WarcRecord.DATE.lower(), +)) + +class WarcParser(ArchiveParser): + KNOWN_VERSIONS=set(('1.0', '0.17', '0.18')) + def __init__(self): + self.trailing_newlines = 0 + + def parse(self,stream, offset): + """Reads a warc record from the stream, returns a tuple (record, errors). + Either records is null or errors is null. Any record-specific errors are + contained in the record - errors is only used when *nothing* could be parsed""" + errors = [] + version = None + # find WARC/.* + line = stream.readline() + newlines = self.trailing_newlines + if newlines > 0: + while line: + match = nl_rx.match(line) + if match and newlines > 0: + if offset is not None: offset+=len(line) + newlines-=1 + if match.group('nl') != '\x0d\x0a': + errors.append(('incorrect trailing newline', match.group('nl'))) + line = stream.readline() + if newlines == 0: + break + else: + break + + if newlines > 0: + errors+=('less than two terminating newlines at end of previous record, missing', newlines) + + while line: + match = version_rx.match(line) + + if match: + version = match.group('version') + if offset is not None: offset+=len(match.group('prefix')) + break + else: + if offset is not None: offset+=len(line) + if not nl_rx.match(line): + errors.append(('ignored line', line)) + if len(errors) > bad_lines: + errors.append(('too many errors, giving up hope',)) + return (None,errors, offset) + line = stream.readline() + if not line: + if version: + errors.append('warc version but no headers', version) + self.trailing_newlines = 0 + return (None, errors, offset) + if line: + content_length = 0 + content_type = None + + record = WarcRecord(errors=errors, version=version) + + + if match.group('nl') != '\x0d\x0a': + record.error('incorrect newline in version', match.group('nl')) + + if match.group('number') not in self.KNOWN_VERSIONS: + record.error('version field is not known (%s)'%(",".join(self.KNOWN_VERSIONS)), match.group('number')) + + + prefix = match.group('prefix') + + if prefix: + record.error('bad prefix on WARC version header', prefix) + + #Read headers + line = stream.readline() + while line and not nl_rx.match(line): + + #print 'header', repr(line) + match = header_rx.match(line) + if match: + if match.group('nl') != '\x0d\x0a': + record.error('incorrect newline in header', match.group('nl')) + name = match.group('name').strip() + value = [match.group('value').strip()] + #print 'match',name, value + + line = stream.readline() + match = value_rx.match(line) + while match: + #print 'follow', repr(line) + if match.group('nl') != '\x0d\x0a': + record.error('incorrect newline in follow header',line, match.group('nl')) + value.append(match.group('value').strip()) + line = stream.readline() + match = value_rx.match(line) + + value = " ".join(value) + + if type_rx.match(name): + if value: + content_type = value + else: + record.error('invalid header',name,value) + elif length_rx.match(name): + try: + #print name, value + 
content_length = int(value) + #print content_length + except ValueError: + record.error('invalid header',name,value) + else: + record.headers.append((name,value)) + + # have read blank line following headers + + # read content + + ### rajbot: if the WARC-Payload-Digest is not present, fabricate it. + ### We do this because we don't want to read large records into memory, + ### since this was exhasting memory and crashing for large payloads. + sha1_digest = None + if 'response' == record.type and re.match('^application/http;\s*msgtype=response$', content_type): + parsed_http_header = False + digest = record.get_header(WarcRecord.PAYLOAD_DIGEST) + if digest is None: + sha1_digest = hashlib.sha1() + else: + #This isn't a http response so pretend we already parsed the http header + parsed_http_header = True + + if content_length is not None: + if content_length > 0: + content=[] + length = 0 + + should_skip_content = False + if content_length > ArchiveParser.content_length_limit: + should_skip_content = True + + while length < content_length: + if not parsed_http_header: + line = stream.readline() + #print 'header:', line + else: + bytes_to_read = min(content_length-length, 1024) + line = stream.read(bytes_to_read) #TODO: rename variable. may be more than just one line + #line = stream.readline() + #print 'line:', repr(line) + if not line: + #print 'no more data' + break + + if should_skip_content: + if not parsed_http_header: + content.append(line) + else: + content.append(line) + + length+=len(line) + + if sha1_digest: + if parsed_http_header: + if length <= content_length: + sha1_digest.update(line) + else: + sha1_digest.update(line[:-(length-content_length)]) + + if not parsed_http_header: + if nl_rx.match(line): + parsed_http_header = True + + #print length, content_length, line + #else: + # print 'last line of content', repr(line) + if sha1_digest: + sha1_str = 'sha1:'+base64.b32encode(sha1_digest.digest()) + record.headers.append((WarcRecord.PAYLOAD_DIGEST, sha1_str)) + + content="".join(content) + + if length > content_length: + #line is the last line we read + trailing_chars = line[-(length-content_length):] + else: + trailing_chars = '' + + #content, line = content[0:content_length], content[content_length:] + content = content[0:content_length] + + #if len(content)!= content_length: + if length < content_length: + record.error('content length mismatch (is, claims)', length, content_length) + + record.content = (content_type, content) + + if nl_rx.match(trailing_chars): + self.trailing_newlines = 1 + else: + self.trailing_newlines = 2 + + else: + record.error('missing header', WarcRecord.CONTENT_LENGTH) + self.trailing_newlines = 2 + + # Fixed: READLINE BUG - eats trailing terminating newlines when content doesn't have a \n + + #print 'read content', repr(line) + # have read trailing newlines + + # check mandatory headers + # WARC-Type + # WARC-Date WARC-Record-ID Content-Length + + # ignore mandatory newlines for now + # because they are missing. 
+ # instead we trim a number of them off the next + # parse + + # we can't re-wind easily without wrapping + # every file handle + + # not brilliant but hey-ho + + + + + return (record, (), offset) + + def trim(self, stream): + """read the end of the file""" + newlines = self.trailing_newlines + self.trailing_newlines = 0 + errors = [] + if newlines: + line = stream.readline() + while line: + #print 'trimming', repr(line) + match = nl_rx.match(line) + if match: + if match.group('nl') != '\x0d\x0a': + errors.append(('incorrect trailing newline', match.group('nl'))) + newlines-=1 + #print 'newline' + if newlines == 0: + break + + else: + #print 'line', line, newlines + newlines = 0 + errors.append(('trailing data after content', line)) + line = stream.readline() + if newlines > 0: + errors+=('less than two terminating newlines at end of record, missing', newlines) + + return errors + + + +blank_rx = rx(r'^$') +register_record_type(version_rx, WarcRecord) +register_record_type(blank_rx, WarcRecord) + +def make_response(id, date, url, content, request_id): + headers = [ + (WarcRecord.TYPE, WarcRecord.RESPONSE), + (WarcRecord.ID, id), + (WarcRecord.DATE, date), + (WarcRecord.URL, url), + + ] + if request_id: + headers.append((WarcRecord.CONCURRENT_TO, request_id)) + + record=WarcRecord(headers=headers, content=content) + + return record + +def make_request(request_id, date, url, content, response_id): + headers = [ + (WarcRecord.TYPE, WarcRecord.REQUEST), + (WarcRecord.ID, request_id), + (WarcRecord.DATE, date), + (WarcRecord.URL, url), + + ] + if response_id: + headers.append((WarcRecord.CONCURRENT_TO, response_id)) + + record=WarcRecord(headers=headers, content=content) + + return record + +def make_metadata(meta_id, date, content, concurrent_to=None, url=None): + headers = [ + (WarcRecord.TYPE, WarcRecord.METADATA), + (WarcRecord.ID, meta_id), + (WarcRecord.DATE, date), + + ] + if concurrent_to: + headers.append((WarcRecord.CONCURRENT_TO, concurrent_to)) + + if url: + headers.append((WarcRecord.URL, url)) + + record=WarcRecord(headers=headers, content=content) + + return record + + +def make_conversion(conv_id, date, content, refers_to=None, url=None): + headers = [ + (WarcRecord.TYPE, WarcRecord.CONVERSION), + (WarcRecord.ID, conv_id), + (WarcRecord.DATE, date), + + ] + if refers_to: + headers.append((WarcRecord.REFERS_TO, refers_to)) + + if url: + headers.append((WarcRecord.URL, url)) + + record=WarcRecord(headers=headers, content=content) + + return record + + + +def warc_datetime_str(d): + s = d.isoformat() + if '.' in s: + s = s[:s.find('.')] + return s +'Z'
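For reference, a minimal usage sketch of the pieces this patch adds; it is not part of the patch itself. 'example.warc.gz' and 'example.cdx' are placeholder filenames, and the calls mirror what CDX_Writer.make_cdx() does with the bundled warctools API (ArchiveRecord.open_archive / read_records):

    from warctools import ArchiveRecord

    fh = ArchiveRecord.open_archive('example.warc.gz', gzip='auto', mode='r')
    for offset, record, errors in fh.read_records(limit=None, offsets=True):
        if record:
            # read_records() yields every record; make_cdx() keeps only
            # 'response' and 'revisit' records unless --all-records is given.
            print offset, record.type, record.content_type, record.url
        elif errors:
            raise Exception('record could not be parsed: %s' % str(errors))
    fh.close()

The same WARC can be indexed end to end with the writer class itself, using the defaults the command line sets up:

    from cdx_writer import CDX_Writer

    cdx_writer = CDX_Writer('example.warc.gz', 'example.cdx')
    cdx_writer.make_cdx()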