Init Commit

dominhhai · Jun 19, 2018 · 6937f9a · 6937f9a
commit 6937f9a
Show file tree

Hide file tree

Showing 12 changed files with 677 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,115 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# train data
+data/captcha
+data/train
+
+# model log
+log
+.DS_Store
+
+# node.js
+node_modules/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2018 Do Minh Hai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Readme.md b/Readme.md
@@ -0,0 +1,95 @@
+# Captcha Breaker
+Built with TensorFlow (ConvNets) and Node.js. Perfect test accuracy (100%) on Amazon Captcha :muscle::muscle::muscle:
+
+# Installation
+#### Python packages
+```
+$ pip install -r requirements.txt
+```
+
+#### Node.js packages (Node.js user only)
+```
+$ npm i
+```
+
+# Usage
+## 1. Create train data
+#### Prepare your training dataset
+* Copy your captcha images to the `data/captcha` folder
+```
+|_data
+      |_captcha
+          |_ captcha_1.jpg
+          |_ captcha_2.jpg
+```
+* Create a mapping file `data/captcha.json` that maps each training image to its corresponding label
+```json
+{
+    "captcha_1.jpg": "HEYMEN",
+    "captcha_2.jpg": "XINCHA"
+}
+```
+
+#### Build train data for model
+Running `src/create_train_data.py` saves your training data as the compressed file `data/captcha.npz`.
+```
+$ python src/create_train_data.py
+```
+
+The compressed `data/captcha.npz` includes:
+* Train Data ( `x_train`, `y_train` ): `80%`
+* Test Data ( `x_test`, `y_test` ): `20%`
+
+## 2. Train
+Run `src/train.py` to train the model with your own dataset.
+```
+$ python src/train.py
+```
+
+Take :coffee: or :tea: while waiting!
+
+## 3. Attack
+Now, enjoy your war :fire::fire::fire: :stuck_out_tongue_winking_eye::stuck_out_tongue_winking_eye::stuck_out_tongue_winking_eye:
+
+#### Python
+```
+$ python src/predict --fname YOUR_IMAGE_PATH_or_URL
+```
+
+Sample output:
+```
+loading image: data/captcha/captcha_2.jpg
+load captcha classifier
+predict for 1 char: `X` with probability: 99.956%
+predict for 2 char: `I` with probability: 99.909%
+predict for 3 char: `N` with probability: 99.556%
+predict for 4 char: `C` with probability: 99.853%
+predict for 5 char: `H` with probability: 99.949%
+predict for 6 char: `A` with probability: 98.889%
+Captcha: `XINCHA` with confident: `99.686%`
+XINCHA
+```
+
+#### Node.js
+```js
+const captchaPredict = require('src/predict')
+
+captchaPredict(YOUR_IMAGE_PATH_or_URL)
+  .then(console.log)
+  .catch(console.error)
+```
+Sample output:
+```
+[
+  "loading image: data/captcha/captcha_2.jpg",
+  "load captcha classifier",
+  "predict for 1 char: `X` with probability: 99.956%",
+  "predict for 2 char: `I` with probability: 99.909%",
+  "predict for 3 char: `N` with probability: 99.556%",
+  "predict for 4 char: `C` with probability: 99.853%",
+  "predict for 5 char: `H` with probability: 99.949%",
+  "predict for 6 char: `A` with probability: 98.889%",
+  "Captcha: `XINCHA` with confident: `99.686%`",
+  "XINCHA"
+]
+```
diff --git a/matplotlibrc b/matplotlibrc
@@ -0,0 +1 @@
+backend: TkAgg
diff --git a/package.json b/package.json
@@ -0,0 +1,23 @@
+{
+  "name": "captcha-breaker",
+  "version": "1.0.0",
+  "description": "Breaking Captcha with Tensorflow",
+  "main": "src/predict.js",
+  "dependencies": {
+    "python-shell": "^0.5.0"
+  },
+  "devDependencies": {},
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/dominhhai/captcha-breaker.git"
+  },
+  "author": "Do Minh Hai",
+  "license": "MIT",
+  "bugs": {
+    "url": "https://github.com/dominhhai/captcha-breaker/issues"
+  },
+  "homepage": "https://github.com/dominhhai/captcha-breaker#README"
+}
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,37 @@
+absl-py==0.2.2
+astor==0.6.2
+backports.functools-lru-cache==1.5
+backports.weakref==1.0.post1
+bleach==1.5.0
+cloudpickle==0.5.3
+cycler==0.10.0
+dask==0.18.0
+decorator==4.3.0
+enum34==1.1.6
+funcsigs==1.0.2
+futures==3.2.0
+gast==0.2.0
+grpcio==1.12.1
+html5lib==0.9999999
+kiwisolver==1.0.1
+Markdown==2.6.11
+matplotlib==2.2.2
+mock==2.0.0
+networkx==2.1
+numpy==1.14.5
+pbr==4.0.4
+Pillow==5.1.0
+protobuf==3.6.0
+pyparsing==2.2.0
+python-dateutil==2.7.3
+pytz==2018.4
+PyWavelets==0.5.2
+scikit-image==0.14.0
+scipy==1.1.0
+six==1.11.0
+subprocess32==3.5.2
+tensorboard==1.8.0
+tensorflow==1.8.0
+termcolor==1.1.0
+toolz==0.9.0
+Werkzeug==0.14.1
diff --git a/src/create_train_data.py b/src/create_train_data.py
@@ -0,0 +1,56 @@
+from __future__ import print_function, absolute_import, division
+import os
+import json
+from skimage import io
+from img import split_letters
+import numpy as np
+
+DATA_DIR = 'data'
+DATA_MAP = os.path.join(DATA_DIR, 'captcha.json')
+DATA_FULL_DIR = os.path.join(DATA_DIR, 'captcha')
+DATA_TRAIN_DIR = os.path.join(DATA_DIR, 'train')
+DATA_TRAIN_FILE = os.path.join(DATA_DIR, 'captcha')
+
+# array of tuple of binary image and label
+data_x = []
+data_y = []
+
+# load image content json file
+with open(DATA_MAP) as f:
+    image_contents = json.load(f)
+
+# load image and save letters
+counter = 0
+for fname, contents in image_contents.iteritems():
+    counter += 1
+    print(counter, fname, contents)
+    image = io.imread(os.path.join(DATA_FULL_DIR, fname))
+
+    # split image
+    letters = split_letters(image, debug=True)
+    if letters != None:
+        fname = fname.replace('.jpg', '.png')
+        for i, letter in enumerate(letters):
+            content = contents[i]
+            # add to dataset
+            data_x.append(letter)
+            data_y.append(np.uint8(ord(content) - 65)) # 65: 'A'
+
+            # save letter into train folder
+            fpath = os.path.join(DATA_TRAIN_DIR, content)
+            if not os.path.exists(fpath):
+                os.makedirs(fpath)
+            letter_fname = os.path.join(fpath, str(i+1) + '-' + fname)
+            io.imsave(letter_fname, 255 - letter) # invert black <> white color
+    else:
+        print('Letters is not valid')
+        break
+
+# split into train and test data set
+train_num = int(len(data_y) * 0.8) # 80%
+
+# save train data
+print('saving dataset')
+np.savez_compressed(DATA_TRAIN_FILE,
+    x_train=data_x[:train_num], y_train=data_y[:train_num],
+    x_test=data_x[train_num:], y_test=data_y[train_num:])