Skip to content

Commit

Permalink
Modified code and README
Browse files Browse the repository at this point in the history
  • Loading branch information
KananVyas committed Jun 29, 2020
1 parent 4d0aa27 commit 853c973
Show file tree
Hide file tree
Showing 17 changed files with 36 additions and 18 deletions.
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
26 changes: 17 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@

# A Box detection algorithm for tabularized data.

When you are working with Optical character recognition(OCR) or any data or
object recognition problem, the first thing to do is preprocessing. Here preprocessing means
to extract the location where our information is located. After extracting the location,
any machine algorithm will be performed on that image. 
- This code is used to extract data which is in tabular format using image processing techniques.
- When you are working with Optical Character Recognition (OCR) or any other data or object recognition problem, the first thing to do is preprocessing.
- Here, preprocessing means extracting the boxes where our data is located. After extracting the boxes, any OCR algorithm can be run on those crops for recognition.
![alt text](http://github.com/KananVyas/BoxDetection/blob/master/41.jpg)

The problem arises when you have to detect objects which are located in any tables/boxes or
in row-column format. If the image is like this then you have to detect boxes and extract them one by one.
- The problem arises when you have to detect objects that are located in tables/boxes or in a row-column format. If the image is like this, you have to detect the boxes and extract them one by one.
This has to be done accurately for all images.

This algorithm helps to detect every boxes accurately and save it in a "Cropped" folder.The code is shown in box_detection.py
and the test image is "41.jpg".


you can see the medium blog for this code: https://medium.com/@kananvyas/a-box-detection-algorithm-for-any-image-containing-boxes-756c15d7ed26
This algorithm helps to detect every box accurately and saves each one in the `/Output/` folder. The code is in `src/box_detection.py`.

**USAGE:**
- Run `python src/box_detection.py`
- You can see the output crops in the `/Output` folder
![alt text](http://github.com/KananVyas/BoxDetection/blob/master/Output/1.png)
![alt text](http://github.com/KananVyas/BoxDetection/blob/master/Output/2.png)

You can also read the Medium article to understand the algorithm: https://medium.com/@kananvyas/a-box-detection-algorithm-for-any-image-containing-boxes-756c15d7ed26
6 changes: 3 additions & 3 deletions box_detection.ipynb → src/box_detection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"(thresh, img_bin) = cv2.threshold(img, 128, 255,cv2.THRESH_BINARY| cv2.THRESH_OTSU)\n",
"# Invert the image\n",
"img_bin = 255-img_bin \n",
"cv2.imwrite(\"Image_bin.jpg\",img_bin)\n",
"cv2.imwrite(\"Images/Image_bin.jpg\",img_bin)\n",
"plt.axis('off')\n",
"plt.imshow(img_bin);"
]
Expand Down Expand Up @@ -88,7 +88,7 @@
" # Morphological operation to detect vertical lines from an image\n",
" img_temp1 = cv2.erode(img_bin, verticle_kernel, iterations=3)\n",
" verticle_lines_img = cv2.dilate(img_temp1, verticle_kernel, iterations=3)\n",
" cv2.imwrite(\"verticle_lines.jpg\",verticle_lines_img)\n",
" cv2.imwrite(\"Images/verticle_lines.jpg\",verticle_lines_img)\n",
" # Morphological operation to detect horizontal lines from an image\n",
" img_temp2 = cv2.erode(img_bin, hori_kernel, iterations=3)\n",
" horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=3)\n",
Expand Down Expand Up @@ -240,4 +240,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
22 changes: 16 additions & 6 deletions box_detection.py → src/box_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,20 @@ def sort_contours(cnts, method="left-to-right"):
# return the list of sorted contours and bounding boxes
return (cnts, boundingBoxes)


# Function for extracting the box
def box_extraction(img_for_box_extraction_path, cropped_dir_path):

print("Reading image..")
img = cv2.imread(img_for_box_extraction_path, 0) # Read the image
(thresh, img_bin) = cv2.threshold(img, 128, 255,
cv2.THRESH_BINARY | cv2.THRESH_OTSU) # Thresholding the image
img_bin = 255-img_bin # Invert the image

cv2.imwrite("Image_bin.jpg",img_bin)

print("Storing binary image to Images/Image_bin.jpg..")
cv2.imwrite("Images/Image_bin.jpg",img_bin)

print("Applying Morphological Operations..")
# Defining a kernel length
kernel_length = np.array(img).shape[1]//40

Expand All @@ -46,12 +51,12 @@ def box_extraction(img_for_box_extraction_path, cropped_dir_path):
# Morphological operation to detect vertical lines from an image
img_temp1 = cv2.erode(img_bin, verticle_kernel, iterations=3)
verticle_lines_img = cv2.dilate(img_temp1, verticle_kernel, iterations=3)
cv2.imwrite("verticle_lines.jpg",verticle_lines_img)
cv2.imwrite("Images/verticle_lines.jpg",verticle_lines_img)

# Morphological operation to detect horizontal lines from an image
img_temp2 = cv2.erode(img_bin, hori_kernel, iterations=3)
horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=3)
cv2.imwrite("horizontal_lines.jpg",horizontal_lines_img)
cv2.imwrite("Images/horizontal_lines.jpg",horizontal_lines_img)

# Weighting parameters, this will decide the quantity of an image to be added to make a new image.
alpha = 0.5
Expand All @@ -63,13 +68,16 @@ def box_extraction(img_for_box_extraction_path, cropped_dir_path):

# For Debugging
# Enable this line to see the vertical and horizontal lines in the image which are used to find boxes
cv2.imwrite("img_final_bin.jpg",img_final_bin)
print("Binary image which only contains boxes: Images/img_final_bin.jpg")
cv2.imwrite("Images/img_final_bin.jpg",img_final_bin)
# Find contours for image, which will detect all the boxes
contours, hierarchy = cv2.findContours(
img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# Sort all the contours by top to bottom.
(contours, boundingBoxes) = sort_contours(contours, method="top-to-bottom")

print("Output stored in Output directiory!")

idx = 0
for c in contours:
# Returns the location and width,height for every contour
Expand All @@ -87,4 +95,6 @@ def box_extraction(img_for_box_extraction_path, cropped_dir_path):
# cv2.imwrite("./Temp/img_contour.jpg", img)


box_extraction("41.jpg", "./Cropped/")

# Input image path and output folder
box_extraction("41.jpg", "./Output/")

0 comments on commit 853c973

Please sign in to comment.