来源:https://juejin.cn/post/7006732549451939847
亮出效果
实现步骤
基本思路
其实,搞定两点就成,第一是能识别数字,第二是能切分数字。
对于图像识别,一般的套路是下面这样的(CNN卷积神经网络):
对于图像切割,一般的套路是下面的这样(横向纵向投影法):
2.1 准备数据
对于男友,找一个油嘴滑舌的花花公子,不如找一个闷葫芦IT男,亲手把他培养成你期望的样子。
索引:0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
字符:0 1 2 3 4 5 6 7 8 9 = + - × ÷
2.1.1 准备字体
2.1.2 生成图片
from __future__ import print_function
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw
import os
import shutil
import time
label_dict = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '=', 11: '+', 12: '-', 13: '×', 14: '÷'}
for value,char in label_dict.items():
train_images_dir = "dataset"+"/"+str(value)
if os.path.isdir(train_images_dir):
shutil.rmtree(train_images_dir)
os.makedirs(train_images_dir)
def makeImage(label_dict, font_path, width=24, height=24, rotate = 0):
# 从字典中取出键值对
for value,char in label_dict.items():
# 创建一个黑色背景的图片,大小是24*24
img
= Image.new(
"RGB"
, (width, height),
"black"
)
draw = ImageDraw.Draw(img)
font = ImageFont.truetype(font_path, int(width*0.9))
font_width, font_height = draw.textsize(char, font)
x = (width - font_width-font.getoffset(char)[0]) / 2
y = (height - font_height-font.getoffset(char)[1]) / 2
draw.text((x,y), char, (255, 255, 255), font)
img = img.rotate(rotate)
time_value = int(round(time.time() * 1000))
img_path = "dataset/{}/img-{}_r-{}_{}.png".format(value,value,rotate,time_value)
img.save(img_path)
font_dir = "./fonts"
for font_name in os.listdir(font_dir):
path_font_file = os.path.join(font_dir, font_name)
for k in range(-10, 10, 1):
# 每个字符都生成图片
makeImage(label_dict, path_font_file, rotate = k)
draw.text((x,y), char, (255, 255, 255), font)
如果代码你运行的没有问题,最终会生成如下结果:
2.2 训练数据
2.2.1 构建模型
你先看代码,外行感觉好深奥,内行偷偷地笑。
# %% 导入必要的包
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import pathlib
import cv2
# %% 构建模型
def create_model():
model = Sequential([
layers.experimental.preprocessing.Rescaling(1./255, input_shape=(24, 24, 1)),
layers.Conv2D(24,3,activation='relu'),
layers.MaxPooling2D((2,2)),
layers.Conv2D(64,3, activation='relu'),
layers.MaxPooling2D((2,2)),
layers.Flatten(),
layers.Dense(128, activation='relu'),
layers.Dense(15)]
)
model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
return model
2.2.2 卷积层 Conv2D
各个职能部门的调查员,搜集和整理某单位区域内的特定数据。我们输入的是一个图像,它是由像素组成的,这就是R e s c a l i n g ( 1. / 255 , i n p u t s h a p e = ( 24 , 24 , 1 ) ) Rescaling(1./255, input_shape=(24, 24, 1))Rescaling(1./255,input shape=(24,24,1))中,input_shape输入形状是24*24像素1个通道(彩色是RGB 3个通道)的图像。
2.2.3 池化层 MaxPooling2D
说白了就是四舍五入。
2.2.4 全连接层 Dense
弱水三千,只取一瓢。
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
rescaling_2 (Rescaling) (None, 24, 24, 1) 0
_________________________________________________________________
conv2d_4 (Conv2D) (None, 22, 22, 24) 240
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 11, 11, 24) 0
_________________________________________________________________
conv2d_5 (Conv2D) (None, 9, 9, 64) 13888
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 4, 4, 64) 0
_________________________________________________________________
flatten_2 (Flatten) (None, 1024) 0
_________________________________________________________________
dense_4 (Dense) (None, 128) 131200
_________________________________________________________________
dense_5 (Dense) (None, 15) 1935
=================================================================
Total params: 147,263
Trainable params: 147,263
Non-trainable params: 0
_________________________________________________________________
2.2.5 训练数据
执行就完了。
# 统计文件夹下的所有图片数量
data_dir = pathlib.Path('dataset')
# 从文件夹下读取图片,生成数据集
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
data_dir, # 从哪个文件获取数据
color_mode="grayscale", # 获取数据的颜色为灰度
image_size=(24, 24), # 图片的大小尺寸
batch_size=32 # 多少个图片为一个批次
)
# 数据集的分类,对应dataset文件夹下有多少图片分类
class_names = train_ds.class_names
# 保存数据集分类
np.save("class_name.npy", class_names)
# 数据集缓存处理
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
# 创建模型
model = create_model()
# 训练模型,epochs=10,所有数据集训练10遍
model.fit(train_ds,epochs=10)
# 保存训练后的权重
model.save_weights('checkpoint/char_checkpoint')
Found 3900 files belonging to 15 classes.
Epoch 1/10 122/122 [=========] - 2s 19ms/step - loss: 0.5795 - accuracy: 0.8615
Epoch 2/10 122/122 [=========] - 2s 18ms/step - loss: 0.0100 - accuracy: 0.9992
Epoch 3/10 122/122 [=========] - 2s 19ms/step - loss: 0.0027 - accuracy: 1.0000
Epoch 4/10 122/122 [=========] - 2s 19ms/step - loss: 0.0013 - accuracy: 1.0000
Epoch 5/10 122/122 [=========] - 2s 20ms/step - loss: 8.4216e-04 - accuracy: 1.0000
Epoch 6/10 122/122 [=========] - 2s 18ms/step - loss: 5.5273e-04 - accuracy: 1.0000
Epoch 7/10 122/122 [=========] - 3s 21ms/step - loss: 4.0966e-04 - accuracy: 1.0000
Epoch 8/10 122/122 [=========] - 2s 20ms/step - loss: 3.0308e-04 - accuracy: 1.0000
Epoch 9/10 122/122 [=========] - 3s 23ms/step - loss: 2.3446e-04 - accuracy: 1.0000
Epoch 10/10 122/122 [=========] - 3s 21ms/step - loss: 1.8971e-04 - accuracy: 1.0000
char_checkpoint.data-00000-of-00001
char_checkpoint.index
checkpoint
专属福利
👉点击领取:Java资料合集!
2.3 预测数据
终于到了享受成果的时候了。
# 设置待识别的图片
img1=cv2.imread('img1.png',0)
img2=cv2.imread('img2.png',0)
imgs = np.array([img1,img2])
# 构建模型
model = create_model()
# 加载前期训练好的权重
model.load_weights('checkpoint/char_checkpoint')
# 读出图片分类
class_name = np.load('class_name.npy')
# 预测图片,获取预测值
predicts = model.predict(imgs)
results = [] # 保存结果的数组
for predict in predicts: #遍历每一个预测结果
index = np.argmax(predict) # 寻找最大值
result = class_name[index] # 取出字符
results.append(result)
print(results)
图片要通过cv2.imread('img1.png',0) 转化为二维数组结构,0参数是灰度图片。经过处理后,图片转成的数组是如下所示(24,24)的结构:
下面这张大图片,怎么把它搞一搞,搞成单个小数字的图片。
2.4 切割图像
2.4.1 投影大法
最有效的方法,往往都是用循环实现的。
import numpy as np
import cv2
from PIL import Image, ImageDraw, ImageFont
import PIL
import matplotlib.pyplot as plt
import os
import shutil
from numpy.core.records import array
from numpy.core.shape_base import block
import time
# 整幅图片的Y轴投影,传入图片数组,图片经过二值化并反色
def img_y_shadow(img_b):
### 计算投影 ###
(h,w)=img_b.shape
# 初始化一个跟图像高一样长度的数组,用于记录每一行的黑点个数
a=[0 for z in range(0,h)]
# 遍历每一列,记录下这一列包含多少有效像素点
for i in range(0,h):
for j in range(0,w):
if img_b[i,j]==255:
a[i]+=1
return a
# 展示图片
def img_show_array(a):
plt.imshow(a)
plt.show()
# 展示投影图, 输入参数arr是图片的二维数组,direction是x,y轴
def show_shadow(arr, direction = 'x'):
a_max = max(arr)
if direction == 'x': # x轴方向的投影
a_shadow = np.zeros((a_max, len(arr)), dtype=int)
for i in range(0,len(arr)):
if arr[i] == 0:
continue
for j in range(0, arr[i]):
a_shadow[j][i] = 255
elif direction == 'y': # y轴方向的投影
a_shadow = np.zeros((len(arr),a_max), dtype=int)
for i in range(0,len(arr)):
if arr[i] == 0:
continue
for j in range(0, arr[i]):
a_shadow[i][j] = 255
img_show_array(a_shadow)
# 读入图片
img_path = 'question.jpg'
img=cv2.imread(img_path,0)
thresh = 200
# 二值化并且反色
ret,img_b=cv2.threshold(img,thresh,255,cv2.THRESH_BINARY_INV)
img_y_shadow_a = img_y_shadow(img_b)
show_shadow(img_y_shadow_a, 'y') # 如果要显示投影
2.4.2 根据投影找区域
最有效的方法,往往还得用循环来实现。
# 图片获取文字块,传入投影列表,返回标记的数组区域坐标[[左,上,右,下]]
def img2rows(a,w,h):
### 根据投影切分图块 ###
inLine = False # 是否已经开始切分
start = 0 # 某次切分的起始索引
mark_boxs = []
for i in range(0,len(a)):
if inLine == False and a[i] > 10:
inLine = True
start = i
# 记录这次选中的区域[左,上,右,下],上下就是图片,左右是start到当前
elif i-start >5 and a[i] < 10 and inLine:
inLine = False
if i-start > 10:
top = max(start-1, 0)
bottom = min(h, i+1)
box = [0, top, w, bottom]
mark_boxs.append(box)
return mark_boxs
通过投影,计算哪些区域在一定范围内是连续的,如果连续了很长时间,我们就认为是同一区域,如果断开了很长一段时间,我们就认为是另一个区域。
(img_h,img_w)=img.shape
row_mark_boxs = img2rows(img_y_shadow_a,img_w,img_h)
print(row_mark_boxs)
2.4.3 根据区域切图片
最有效的方法,最终也得用循环来实现。这也是计算机体现它强大的地方。
# 裁剪图片,img 图片数组, mark_boxs 区域标记
def cut_img(img, mark_boxs):
img_items = [] # 存放裁剪好的图片
for i in range(0,len(mark_boxs)):
img_org = img.copy()
box = mark_boxs[i]
# 裁剪图片
img_item = img_org[box[1]:box[3], box[0]:box[2]]
img_items.append(img_item)
return img_items
# 保存图片
def save_imgs(dir_name, imgs):
if os.path.exists(dir_name):
shutil.rmtree(dir_name)
if not os.path.exists(dir_name):
os.makedirs(dir_name)
img_paths = []
for i in range(0,len(imgs)):
file_path = dir_name+'/part_'+str(i)+'.jpg'
cv2.imwrite(file_path,imgs[i])
img_paths.append(file_path)
return img_paths
# 切图并保存
row_imgs = cut_img(img, row_mark_boxs)
imgs = save_imgs('rows', row_imgs) # 如果要保存切图
print(imgs)
2.4.4 循环可去油腻
还是循环。横着行我们掌握了,那么针对每一行图片,我们竖着切成三块是不是也会了,一个道理。
kernel=np.ones((3,3),np.uint8) # 膨胀核大小
row_img_b=cv2.dilate(img_b,kernel,iterations=6) # 图像膨胀6次
def divImg(img_path, save_file = False):
img_o=cv2.imread(img_path,1)
# 读入图片
img=cv2.imread(img_path,0)
img.shape =
thresh = 200
# 二值化整个图,用于分行
cv2.threshold(img,thresh,255,cv2.THRESH_BINARY_INV) =
# 计算投影,并截取整个图片的行
img_y_shadow_a = img_y_shadow(img_b)
row_mark_boxs = img2rows(img_y_shadow_a,img_w,img_h)
# 切行的图片,切的是原图
row_imgs = cut_img(img, row_mark_boxs)
all_mark_boxs = []
all_char_imgs = []
# ===============从行切块======================
for i in range(0,len(row_imgs)):
row_img = row_imgs[i]
row_img.shape =
# 二值化一行的图,用于切块
cv2.threshold(row_img,thresh,255,cv2.THRESH_BINARY_INV) =
kernel=np.ones((3,3),np.uint8)
#图像膨胀6次
row_img_b_d=cv2.dilate(row_img_b,kernel,iterations=6)
img_x_shadow_a = img_x_shadow(row_img_b_d)
block_mark_boxs = row2blocks(img_x_shadow_a, row_img_w, row_img_h)
row_char_boxs = []
row_char_imgs = []
# 切块的图,切的是原图
block_imgs = cut_img(row_img, block_mark_boxs)
if save_file:
b_imgs = save_imgs('cuts/row_'+str(i), block_imgs) # 如果要保存切图
print(b_imgs)
# =============从块切字====================
for j in range(0,len(block_imgs)):
block_img = block_imgs[j]
block_img.shape =
# 二值化块,因为要切字符图片了
cv2.threshold(block_img,thresh,255,cv2.THRESH_BINARY_INV) =
block_img_x_shadow_a = img_x_shadow(block_img_b)
row_top = row_mark_boxs[i][1]
block_left = block_mark_boxs[j][0]
block2chars(block_img_x_shadow_a, block_img_w, block_img_h,row_top,block_left) =
row_char_boxs.append(abs_char_mark_boxs)
# 切的是二值化的图
char_imgs = cut_img(block_img_b, char_mark_boxs, True)
row_char_imgs.append(char_imgs)
if save_file:
c_imgs = save_imgs('cuts/row_'+str(i)+'/blocks_'+str(j), char_imgs) # 如果要保存切图
print(c_imgs)
all_mark_boxs.append(row_char_boxs)
all_char_imgs.append(row_char_imgs)
return all_mark_boxs,all_char_imgs,img_o
2.5 识别
循环,循环,还是TM循环!
all_mark_boxs,all_char_imgs,img_o = divImg(path,save)
model = cnn.create_model()
model.load_weights('checkpoint/char_checkpoint')
class_name = np.load('class_name.npy')
for i in range(0,len(all_char_imgs)):
row_imgs = all_char_imgs[i]
for j in range(0,len(row_imgs)):
block_imgs = row_imgs[j]
block_imgs = np.array(block_imgs)
results = cnn.predict(model, block_imgs, class_name)
print('recognize result:',results)
recognize result: ['1', '0', '12', '2', '10']
recognize result: ['8', '12', '6', '10']
recognize result: ['1', '0', '12', '7', '10']
recognize result: ['1', '0', '-', '2', '=']
recognize result: ['8', '-', '6', '=']
recognize result: ['1', '0', '-', '7', '=']
2.6 计算并反馈
# 计算数值并返回结果 参数chars:['8', '-', '6', '=']
def calculation(chars):
cstr = ''.join(chars)
result = ''
if("=" in cstr): # 有等号
str_arr = cstr.split('=')
c_str = str_arr[0]
r_str = str_arr[1]
c_str = c_str.replace("×","*")
c_str = c_str.replace("÷","/")
try:
c_r = int(eval(c_str))
except Exception as e:
print("Exception",e)
if r_str == "":
result = c_r
else:
if str(c_r) == str(r_str):
result = "√"
else:
result = "×"
return result
recognize result: ['8', '×', '4', '=']
calculate result: 32
recognize result: ['2', '-', '1', '=', '1']
calculate result: √
recognize result: ['1', '0', '-', '5', '=']
calculate result: 5
2.6.2 反馈
# 绘制文本
def cv2ImgAddText(img, text, left, top, textColor=(255, 0, 0), textSize=20):
if (isinstance(img, np.ndarray)): # 判断是否OpenCV图片类型
img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
# 创建一个可以在给定图像上绘图的对象
draw = ImageDraw.Draw(img)
# 字体的格式
fontStyle = ImageFont.truetype("fonts/fangzheng_shusong.ttf", textSize, encoding="utf-8")
# 绘制文本
draw.text((left, top), text, textColor, font=fontStyle)
# 转换回OpenCV格式
return cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)
# 获取切图标注,切图图片,原图图图片
divImg(path,save) =
# 恢复模型,用于图片识别
model = cnn.create_model()
model.load_weights('checkpoint/char_checkpoint')
class_name = np.load('class_name.npy')
# 遍历行
for i in range(0,len(all_char_imgs)):
row_imgs = all_char_imgs[i]
# 遍历块
for j in range(0,len(row_imgs)):
block_imgs = row_imgs[j]
block_imgs = np.array(block_imgs)
# 图片识别
results = cnn.predict(model, block_imgs, class_name)
result:',results)
# 计算结果
result = calculation(results)
result:',result)
# 获取块的标注坐标
block_mark = all_mark_boxs[i][j]
# 获取结果的坐标,写在块的最后一个字
answer_box = block_mark[-1]
# 计算最后一个字的位置
x = answer_box[2]
y = answer_box[3]
iw = answer_box[2] - answer_box[0]
ih = answer_box[3] - answer_box[1]
# 计算字体大小
textSize = max(iw,ih)
# 根据结果设置字体颜色
if str(result) == "√":
color = (0, 255, 0)
elif str(result) == "×":
color = (255, 0, 0)
else:
color = (192, 192,192)
# 将结果写到原图上
img_o = cv2ImgAddText(img_o, str(result), answer_box[2], answer_box[1],color, textSize)
# 将写满结果的原图保存
img_o)
# 注意
同级新建fonts文件夹里拷贝一些字体文件,从这里找C:\Windows\Fonts,几十个就行。 get_character_pic.py 生成字体 cnn.py 训练数据 main.py 裁剪指定图片并识别,素材图片新建imgs文件夹,在imgs/question.png下,结果文件保存在imgs/result.png。 注意如果识别不成功,很可能是question.png的字体你没有训练(这幅图的字体是方正书宋简体,但是你只训练了楷体),这时候可以使用楷体自己编一个算式图。
来自最美程序员公众号