This is for all intents and purposes, the initial commit. In reality,

there were several commits preceding this one in a now deleted branch for the purposes of configuring and testing against CI.
sahifedp · Dec 27, 2014 · 4846de0 · 4846de0
commit 4846de0
Show file tree

Hide file tree

Showing 117 changed files with 85,239 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,32 @@
+language: python
+python:
+  - "2.7"
+
+install:
+  # install dependencies
+  - sudo which python
+  - sudo apt-get install cython python-ipy
+
+  # build streamhtmlparser and python module
+  - cd include/streamhtmlparser
+  - ./configure
+  - sudo make
+  - sudo make install
+  - cd src/py-streamhtmlparser
+  - sudo make
+  - sudo make install
+  - cd ../../../../../
+
+  # put streamhtmlparser in the proper environment
+  - sudo cp -R /usr/local/lib/python2.7/dist-packages/* /usr/lib/python2.7/dist-packages
+  - sudo ldconfig -v
+
+  # build runtime environment
+  - sudo mkdir /var/log/swiperproxy
+  - sudo addgroup --system swiperproxy
+  - sudo adduser --system swiperproxy --ingroup=swiperproxy --no-create-home
+  - sudo chown -R swiperproxy:swiperproxy swiperproxy
+  - sudo chown swiperproxy:swiperproxy /var/log/swiperproxy
+
+script:
+  - sudo start-stop-daemon --start --background --pidfile /var/run/swiperproxy.pid --make-pidfile --user swiperproxy --chuid swiperproxy --startas swiperproxy/Proxy.py -- -c swiperproxy/proxy.conf
diff --git a/Buffer.py b/Buffer.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2014 SwiperProxy Team
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish
+# distribute, sublicense and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject
+# to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+class Buffer(object):
+    MAX_BUFFER=1024*32
+    def __init__(self, max_size=MAX_BUFFER):
+        self.buffers = []
+        self.max_size = max_size
+        self.closing = False
+        self.eof = False
+        self.read_pos = 0
+        self.write_pos = 0
+
+    def write(self, data):
+        try:
+            if not self.buffers:
+                self.buffers.append(StringIO())
+                self.write_pos = 0
+            buffer = self.buffers[-1]
+            buffer.seek(self.write_pos)
+            buffer.write(data)
+            if buffer.tell() >= self.max_size:
+                buffer = StringIO()
+                self.buffers.append(buffer)
+            self.write_pos = buffer.tell()
+        finally:
+            pass
+
+    def read(self, length=-1):
+        read_buf = StringIO()
+        try:
+            remaining = length
+            while True:
+                if not self.buffers:
+                    break
+                buffer = self.buffers[0]
+                buffer.seek(self.read_pos)
+                read_buf.write(buffer.read(remaining))
+                self.read_pos = buffer.tell()
+                if length == -1:
+                    # We did not limit the read, we exhausted the
+                    # buffer, so delete it and keep reading from
+                    # remaining buffers.
+                    del self.buffers[0]
+                    self.read_pos = 0
+                else:
+                    # We limited the read so either we exhausted the
+                    # buffer or not:
+                    remaining = length - read_buf.tell()
+                    if remaining > 0:
+                        # Exhausted, remove buffer, read more. Keep
+                        # reading from remaining buffers.
+                        del self.buffers[0]
+                        self.read_pos = 0
+                    else:
+                        # Did not exhaust buffer, but read all that
+                        # was requested. Break to stop reading and
+                        # return data of requested length.
+                        break
+        finally:
+            pass
+        return read_buf.getvalue()
+
+    def flush(self):
+        pass
+
+    def __len__(self):
+        len = 0
+        try:
+            for buffer in self.buffers:
+                buffer.seek(0, 2)
+                if buffer == self.buffers[0]:
+                    len += buffer.tell() - self.read_pos
+                else:
+                    len += buffer.tell()
+            return len
+        finally:
+            pass
+
+    def close(self):
+        self.eof = True
diff --git a/CSSPage.py b/CSSPage.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2014 SwiperProxy Team
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish
+# distribute, sublicense and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject
+# to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import re
+import Util
+
+class CSSPage:
+    """
+    Used for a CSS stylesheet. Uses the reader function to read a
+    block, rewrites that block and writes it to the client using the
+    writer function.
+    """
+    BLKSIZE=65536
+
+    def __init__(self, config, ssl, reader, writer, remote_host):
+        self.config = config
+        self.ssl = ssl
+        self.reader = reader
+        self.writer = writer
+        self.input_buffer = ''
+        self.output_buffer = ''
+        self.remote_host = remote_host
+
+    def rewrite_re(self, m):
+        part1 = m.group(1) or ''
+        scheme = m.group(6) or ''
+        url = m.group(7) or ''
+        closer = m.group(9) or ''
+
+        return part1 + Util.rewrite_URL(scheme+"//"+url, self.config, self.ssl,
+                                        self.remote_host) + closer
+
+    def rewrite(self):
+        pattern = r"(((background(-image)?\s*:)|@import)\s*(url)?\s*[('\"]+\s*)(https?:)?//([^\"')]+)(:\d+)?([)'\"]+)"
+
+        while True:
+            s = self.reader(self.BLKSIZE)
+            if not s or len(s) == 0:
+                # End of file, there may be a left-over in the input
+                # buffer.
+                self.output_buffer += self.input_buffer
+                self.write_output(True)
+                break
+
+            self.input_buffer += s
+
+            news = re.sub(pattern, self.rewrite_re, self.input_buffer,
+                          re.I|re.M|re.S)
+
+            # It may be the case that the background image string is
+            # divided over two blocks. Keep the last 1024 bytes in the
+            # input buffer and write everything up to that point to the
+            # output buffer
+            if len(news) > 1024:
+                self.output_buffer += news[:-1024]
+                self.input_buffer = news[-1024:]
+                self.write_output(False)
+            else:
+                self.output_buffer += news
+                self.input_buffer = ''
+                self.write_output(False)
+
+    def write_output(self, final):
+        length = len(self.output_buffer)
+        for beg in range(0, length, self.BLKSIZE):
+            end = beg + self.BLKSIZE
+            if end > length:
+                if not final:
+                    self.output_buffer = self.output_buffer[beg:]
+                    return
+                end = length
+            self.writer(self.output_buffer[beg:end])
+
+        self.output_buffer = ''
diff --git a/JSPage.py b/JSPage.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2014 SwiperProxy Team
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish
+# distribute, sublicense and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject
+# to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import re
+
+BLKSIZE=65536
+
+# A string without unescaped quote characters, followed by a quote.
+re_scan = re.compile(r"(([^\\\"']|\\.)*)['\"]")
+
+# A URL or hostname to rewrite.
+re_url = re.compile(r"(https?:\\?/\\?/)?([a-zA-Z0-9\-\.]+\.(AC|AD|AE|AERO|AF|AG|AI|AL|AM|AN|AO|AQ|AR|ARPA|AS|ASIA|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BIZ|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CAT|CC|CD|CF|CG|CH|CI|CK|CL|CM|CN|CO|COM|COOP|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EDU|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|GF|GG|GH|GI|GL|GM|GN|GOV|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|INFO|INT|IO|IQ|IR|IS|IT|JE|JM|JO|JOBS|JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MG|MH|MIL|MK|ML|MM|MN|MO|MOBI|MP|MQ|MR|MS|MT|MU|MUSEUM|MV|MW|MX|MY|MZ|NA|NAME|NC|NE|NET|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|ORG|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|PRO|PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC|TD|TEL|TF|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TRAVEL|TT|TV|TW|TZ|UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|XN--0ZWM56D|XN--11B5BS3A9AJ6G|XN--3E0B707E|XN--45BRJ9C|XN--80AKHBYKNJ4F|XN--80AO21A|XN--90A3AC|XN--9T4B11YI5A|XN--CLCHC0EA0B2G2A9GCD|XN--DEBA0AD|XN--FIQS8S|XN--FIQZ9S|XN--FPCRJ9C3D|XN--FZC2C9E2C|XN--G6W251D|XN--GECRJ9C|XN--H2BRJ9C|XN--HGBK6AJ7F53BBA|XN--HLCJ6AYA9ESC7A|XN--J6W193G|XN--JXALPDLP|XN--KGBECHTV|XN--KPRW13D|XN--KPRY57D|XN--LGBBAT1AD8J|XN--MGBAAM7A8H|XN--MGBAYH7GPA|XN--MGBBH1A71E|XN--MGBC0A9AZCG|XN--MGBERP4A5D4AR|XN--O3CW4H|XN--OGBPF8FL|XN--P1AI|XN--PGBS0DH|XN--S9BRJ9C|XN--WGBH1C|XN--WGBL6A|XN--XKC2AL3HYE2A|XN--XKC2DL3A5EE0H|XN--YFRO4I67O|XN--YGBI2AMMX|XN--ZCKZAH|XXX|YE|YT|ZA|ZM|ZW))(?![a-zA-Z0-9\-\.])(:\d+)?", re.I)
+
+class JSPage(object):
+    def __init__(self, config, ssl, reader, writer, remote_host):
+        self.config = config
+        self.ssl = ssl
+        self.reader = reader
+        self.writer = writer
+        self.output_buffer = []
+        self.input_buffer = ""
+        self.input_pos = 0
+        self.eof = False
+        self.output_size = 0
+        self.remote_host = remote_host
+
+    def read_some(self):
+        """
+        Read a block of data into the input buffer. Discard any data in
+        the input buffer that has already been processed. Set the EOF
+        marker if there is no more input.
+        """
+        if self.eof: return
+        s = self.reader(BLKSIZE)
+        if not s:
+            self.eof = True
+            return
+        self.input_buffer = self.input_buffer[self.input_pos:] + s
+        self.input_pos = 0
+
+    # Put a string into the output buffer. If the total length of the
+    # output buffer is at least BLKSIZE, write it to the output stream.
+    def output(self, s):
+        """
+        Put a string into the output buffer. If the total length of the
+        output buffer is at least BLKSIZE, write it to the output
+        stream.
+        """
+        self.output_buffer.append(s)
+        l = self.output_size
+        l += len(s)
+        if l >= BLKSIZE:
+            self.writer("".join(self.output_buffer))
+            self.output_buffer = []
+            self.output_size = 0
+        else:
+            self.output_size = l
+
+    def flush(self):
+        """
+        At the end, flush any remaining data in the output buffer.
+        """
+        self.writer("".join(self.output_buffer))
+
+    def rewrite_part(self, s):
+        m = re_url.match(s)
+        if not m: return s
+
+        hostname = m.group(2)
+        if hostname.lower().endswith(self.config.hostname):
+            return s
+        scheme = m.group(1) or ''
+
+        if self.ssl:
+            port = self.config.https_port
+        else:
+            port = self.config.http_port
+
+        # Not necessary to use standard port numbers. Assume proxy is
+        # not doing HTTP on 443 or HTTPS on 80.
+        if port == 80 or port == 443:
+            portstr = ''
+        else:
+            portstr = ':' + str(port)
+
+        if scheme:
+            s = "".join((scheme, hostname, ".", self.config.hostname,
+                         portstr, s[m.end():]))
+        else:
+            s = "".join((scheme, hostname, ".", self.config.hostname,
+                         m.group(4) or '', s[m.end():]))
+
+        return s
+
+    def rewrite(self):
+        max_page_size = self.config.max_page_size
+
+        # Read the first block to make sure there is some data to work
+        # with.
+        self.read_some()
+
+        while True:
+            s = self.input_buffer
+            p = self.input_pos
+
+            # Too much data without a quoted string match: stop and
+            # flush the remainder.
+            if len(s) >= max_page_size: break
+
+            # Find the next unescaped quote character.
+            m = re_scan.match(s, p)
+            if not m:
+                # None found. If there is more input, read another
+                # chunk of data and try again.
+                if self.eof: break
+                self.read_some()
+                continue
+
+            # Rewrite a possible URL or hostname in the part, and
+            # advance to the next position. For efficiency, the quote
+            # is included in the string passed to rewrite_part, but
+            # this is harmless because it will always be copied to the
+            # output anyway.
+            self.output(self.rewrite_part(m.group()))
+            self.input_pos = m.end()
+
+        # Write whatever is left in the input buffer, and flush the
+        # output stream.
+        self.output(s[p:])
+        self.flush()