By: Jayant Pahuja & Spongebob
Data Expert @OpenBudgetsIndia
Core Team Member @DataKindBlR
Image('govt_budget_portals.png')
Image('whaat.jpg')
Image('kn_sample_image.png')
Image('west_bengal_sample.png')
Image('gj_sample_image.png')
Concept: Detect rows and columns based on the detected lines and parse the information to extract tables in CSV format.
import cv2
import subprocess

import matplotlib.pyplot as plt
import numpy as np
# Converting a pdf page into an image for processing
def get_page_image_from_pdf(page_num, image_file_name, pdf_file_path):
    '''
    Convert a pdf page into an image using imagemagick's convert utility
    '''
    command = "convert -density 300 '%s'[%s] '%s'" % (pdf_file_path,
                                                      page_num,
                                                      image_file_name)
    subprocess.check_output(command, shell=True)
    return cv2.imread(image_file_name, 0)
kn_page_image = get_page_image_from_pdf(11, 'kn_sample_image.png', kn_pdf_file_path)
plt.figure(figsize=(30,20))
plt.imshow(kn_page_image, cmap='gray')
To detect lines we use OpenCV's Hough transform implementations:
cv2.HoughLines(), cv2.HoughLinesP()
Image('lines.jpg')
edges = cv2.Canny(kn_page_image, 120, 150, apertureSize=3)
plot_page(edges)
lines = cv2.HoughLines(edges,1,np.pi/180,200)
lines[:5]
array([[[  4.42000000e+02,   1.57079637e+00]],
       [[  2.26000000e+03,   1.57079637e+00]],
       [[  5.35000000e+02,   1.57079637e+00]],
       [[  1.76000000e+03,   1.57079637e+00]],
       [[  1.83500000e+03,   1.57079637e+00]]], dtype=float32)
HTML('<img src="http://opencv-python-tutroals.readthedocs.io/en/latest/_images/houghlinesdemo.gif">')
def get_straight_lines(img, aperture_size=3):
    '''Extract long straight lines using the Probabilistic Hough Transform
    '''
    edges = cv2.Canny(img, 120, 150, apertureSize=aperture_size)
    min_line_length = 100
    max_line_gap = 100
    # Pass the length/gap parameters as keyword arguments; passed positionally
    # they would be interpreted as the optional `lines` output argument.
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 80,
                            minLineLength=min_line_length,
                            maxLineGap=max_line_gap)
    return lines
lines = get_straight_lines(kn_page_image)
plot_lines_on_image(lines, kn_page_image)
table_limits = get_table_limits(kn_page_image, lines, False)
plot_page(extend_lines_for_table(kn_page_image, lines, False, table_limits))
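plot_page, plot_lines_on_image, get_table_limits and extend_lines_for_table are plotting/geometry helpers from the accompanying code that are not reproduced in this excerpt. As an example, a minimal sketch of plot_lines_on_image (a hypothetical stand-in, assuming lines is the (N, 1, 4) array returned by cv2.HoughLinesP) might look like:

import cv2
import matplotlib.pyplot as plt

def plot_lines_on_image_sketch(lines, img):
    '''Draw every detected segment in red on an RGB copy of the page image.'''
    annotated = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    for line in lines:
        x1, y1, x2, y2 = line[0]
        cv2.line(annotated, (x1, y1), (x2, y2), (255, 0, 0), 3)  # red in RGB order
    plt.figure(figsize=(30, 20))
    plt.imshow(annotated)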
print("tabula --pages 4 --area top,left,bottom,right --columns c1,c2,c3,c4 path/to/pdf")
Image("spongebob.jpg")
tabula --pages 4 --area top,left,bottom,right --columns c1,c2,c3,c4 path/to/pdf
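The detected table boundary and column separators can then be handed to the tabula CLI shown above. A minimal wrapper sketch, assuming table_limits is (top, left, bottom, right) and column_positions is a list of x coordinates in the units tabula expects, and that the tabula CLI is on the PATH and prints CSV to stdout (its default) — none of which is shown in this excerpt:

import subprocess

def extract_table_to_csv(pdf_file_path, page_num, table_limits, column_positions, csv_file_path):
    '''Run the tabula CLI with an explicit table area and explicit column
    separators, and save the CSV it writes to stdout.'''
    top, left, bottom, right = table_limits  # assumed ordering
    command = 'tabula --pages %s --area %s,%s,%s,%s --columns %s "%s"' % (
        page_num, top, left, bottom, right,
        ','.join(str(c) for c in column_positions),
        pdf_file_path)
    csv_output = subprocess.check_output(command, shell=True)
    with open(csv_file_path, 'wb') as f:
        f.write(csv_output)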
Image('west_bengal_sample.png')
Image('subject2.jpg')
Image('subject1.jpg')
Image('subject3.jpg')
We use a modified version of the Run Length Smoothing Algorithm (RLSA).
RLSA converts a binary sequence `x` into `y` based on two rules:
1. 0's in `x` are changed to 1's in `y` if the number of adjacent 0's is less than or equal to a threshold C.
2. 1's in `x` remain unchanged in `y`.
C here is the smoothing threshold; the code below uses separate horizontal and vertical thresholds (see the sketch after the source link).
source: http://crblpocr.blogspot.in/2007/06/run-length-smoothing-algorithm-rlsa.html
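The rlsa_2d helper used below comes from the accompanying code and is not reproduced here, and wb_page_image is assumed to be the West Bengal page rendered with get_page_image_from_pdf as above. A minimal sketch of the idea, with rlsa_1d and rlsa_2d_sketch as hypothetical names, could look like this:

import numpy as np

def rlsa_1d(row, c):
    '''Smooth one binary row/column: fill gaps of 0's no longer than c.'''
    out = row.copy()
    ones = np.flatnonzero(row == 1)
    for start, end in zip(ones[:-1], ones[1:]):
        if end - start - 1 <= c:      # the run of 0's between two 1's is short
            out[start:end] = 1        # so fill it with 1's
    return out

def rlsa_2d_sketch(img, threshold):
    '''Apply 1-D RLSA horizontally and vertically and combine with AND.'''
    horizontal_c, vertical_c = threshold
    binary = (img < 128).astype(np.uint8)     # ink pixels become 1
    horizontal = np.apply_along_axis(rlsa_1d, 1, binary, horizontal_c)
    vertical = np.apply_along_axis(rlsa_1d, 0, binary, vertical_c)
    return horizontal & vertical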
threshold = (25, 20)
wb_image_rlsa = rlsa_2d(wb_page_image, threshold)
plot_page(wb_image_rlsa)
def generate_blocks_dilation(img):
    '''Invert-threshold the page so black (ink) pixels become 1, then dilate
    to merge nearby text into blocks.'''
    kernel = np.ones((5, 10), np.uint8)
    ret, thresh1 = cv2.threshold(img, 0, 1, cv2.THRESH_BINARY_INV)
    return cv2.dilate(thresh1, kernel, iterations=5)
plot_page(generate_blocks_dilation(wb_page_image))
To get the bounding box, area and centroid of each block, use cv2.connectedComponentsWithStats().
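The get_block_stats helper used below is not included in this excerpt. A minimal sketch, assuming it simply wraps the per-component statistics and centroids into a DataFrame and drops the background component (label 0), might be:

import pandas as pd

def get_block_stats_sketch(stats, centroids):
    '''Hypothetical stand-in for get_block_stats: one row per connected
    component, with its bounding box, area and centroid.'''
    block_stats = pd.DataFrame(
        stats, columns=['left', 'top', 'width', 'height', 'area'])
    block_stats['centroid_x'] = centroids[:, 0]
    block_stats['centroid_y'] = centroids[:, 1]
    return block_stats.iloc[1:]   # label 0 is the background component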
import pandas as pd
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(wb_image_rlsa)
block_stats = get_block_stats(stats, centroids)
block_stats['right'] = block_stats.left + block_stats.width
block_stats['bottom'] = block_stats.top + block_stats.height
block_stats.head(10)
| | left | top | width | height | area | centroid_x | centroid_y | right | bottom |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0 | 25 | 20 | 500 | 12.000000 | 9.500000 | 25 | 20 |
| 2 | 130 | 0 | 2188 | 20 | 43760 | 1223.500000 | 9.500000 | 2318 | 20 |
| 3 | 0 | 105 | 25 | 54 | 1350 | 12.000000 | 131.500000 | 25 | 159 |
| 4 | 906 | 105 | 643 | 54 | 34128 | 1226.833304 | 131.129249 | 1549 | 159 |
| 5 | 0 | 169 | 25 | 49 | 1225 | 12.000000 | 193.000000 | 25 | 218 |
| 6 | 833 | 169 | 789 | 49 | 35613 | 1227.420240 | 191.262853 | 1622 | 218 |
| 7 | 0 | 242 | 25 | 22 | 550 | 12.000000 | 252.500000 | 25 | 264 |
| 8 | 143 | 242 | 2168 | 22 | 47696 | 1226.500000 | 252.500000 | 2311 | 264 |
| 9 | 0 | 282 | 25 | 54 | 1350 | 12.000000 | 308.500000 | 25 | 336 |
| 10 | 1579 | 282 | 133 | 54 | 5931 | 1644.686562 | 307.158827 | 1712 | 336 |
Identifying the layout of the table, i.e. labelling its elements (titles, headers, cell groupings and cell values).
To extract the text inside each block we use Poppler's version of `pdftotext`.
def get_text(page_num, pdf_file_name, x, y, w, h):
    '''Extract the text inside a bounding box of a page using pdftotext.'''
    command_text = 'pdftotext -enc UTF-8 -f {0} -l {0} -x {1} -y {2} -W {3} -H {4} "{5}" -'
    command = command_text.format(page_num + 1, int(x),
                                  int(y), int(w),
                                  int(h), pdf_file_name)
    return subprocess.check_output(command, shell=True)
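get_text_data, horizontal_ratio and vertical_ratio come from the accompanying code; the real helper also derives the numeric flags shown in the table below. A minimal sketch, assuming the ratios map image-pixel coordinates into the coordinate space pdftotext expects (e.g. pdf_page_width / image_width), might be:

def get_text_data_sketch(row, page_num, pdf_file_name, horizontal_ratio, vertical_ratio):
    '''Hypothetical stand-in for get_text_data: scale a block's pixel bounding
    box and attach the text pdftotext finds inside it.'''
    text = get_text(page_num, pdf_file_name,
                    row['left'] * horizontal_ratio,
                    row['top'] * vertical_ratio,
                    row['width'] * horizontal_ratio,
                    row['height'] * vertical_ratio)
    row['text'] = text.decode('utf-8').strip()
    row['text_length'] = len(row['text'])
    return row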
block_stats = block_stats.apply(get_text_data, axis=1, args=[33, wb_pdf_file_path, horizontal_ratio, vertical_ratio])
block_stats[block_stats.text_length > 0][text_columns].sample(5)
| | text | text_length | comma_separated_numbers_present | is_text | number |
|---|---|---|---|---|---|
| 61 | 1,98,00,000 | 11 | False | False | 19800000.0 |
| 83 | ... | 3 | False | False | NaN |
| 91 | 4,96,66,000 | 11 | False | False | 49666000.0 |
| 159 | ... | 3 | False | False | NaN |
| 105 | 4,55,65,000 | 11 | False | False | 45565000.0 |
block_stats[['text', 'label']].sample(10)
| | text | label |
|---|---|---|
| 137 | 02-W.B.H.S. 2008 | cell_grouping |
| 59 | Total - 2013-00-800-NP-001-13 | NaN |
| 103 | Voted | NaN |
| 24 | Rs. | NaN |
| 52 | 66,33,527 | cell_values |
| 68 | 65,000 | cell_values |
| 134 | ... | cell_values |
| 82 | 13,08,000 | cell_values |
| 145 | 70-Deduct Recoveries | NaN |
| 167 | 001-Sumptuary and Other Allowances [CL] | NaN |
Image('rules.jpg')
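The actual labelling rules are summarised in the image above. Purely as an illustration (this is not the notebook's rule set), one such rule could mark a block as cell_values when its text is an Indian-style comma-separated number or an ellipsis placeholder:

import re

def label_cell_values(text):
    '''Illustrative rule only: comma-separated numbers and '...' placeholders
    are treated as table cell values.'''
    stripped = text.strip()
    if stripped == '...' or re.fullmatch(r'\d{1,3}(,\d{2,3})*', stripped):
        return 'cell_values'
    return None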
block_stats.head()
| | area | bottom | centroid_x | centroid_y | comma_separated_numbers_present | header_index | height | is_text | label | left | number | pos | possible_row_merger | right | text | text_length | top | width |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 34128.0 | 159.0 | 1226.833304 | 131.129249 | 0.0 | NaN | 54.0 | 1.0 | title | 906.0 | NaN | 4.0 | 0.0 | 1549.0 | REVENUE EXPENDITURE | 19.0 | 105.0 | 643.0 |
| 6 | 35613.0 | 218.0 | 1227.420240 | 191.262853 | 0.0 | NaN | 49.0 | 1.0 | title | 833.0 | NaN | 6.0 | 0.0 | 1622.0 | DETAILED ACCOUNT - MAJOR HEAD 2013 | 34.0 | 169.0 | 789.0 |
| 10 | 5931.0 | 336.0 | 1644.686562 | 307.158827 | 0.0 | NaN | 54.0 | 1.0 | NaN | 1579.0 | NaN | 10.0 | 0.0 | 1712.0 | Budget | 6.0 | 282.0 | 133.0 |
| 11 | 6180.0 | 328.0 | 1927.556472 | 305.665210 | 0.0 | NaN | 46.0 | 1.0 | NaN | 1856.0 | NaN | 11.0 | 0.0 | 2001.0 | Revised | 7.0 | 282.0 | 145.0 |
| 12 | 5931.0 | 336.0 | 2222.686562 | 307.158827 | 0.0 | NaN | 54.0 | 1.0 | NaN | 2157.0 | NaN | 12.0 | 0.0 | 2290.0 | Budget | 6.0 | 282.0 | 133.0 |
print('Source: https://twitter.com/fatiichi/status/466398802476548096')
Image('googly_eyes.jpg')
Source: https://twitter.com/fatiichi/status/466398802476548096
Image('static_qr_code_without_logo.jpg')