1 year ago
#387585
Josue Marin
Python + OpenCV - Show Video Feed from other script on Tkinter GUI
I have written a Python Script with the sole intention of detecting text on the webcam video feed using cv2 and pytesseract.
This function is called webcamOCR, and it lives in OCRDetection.py.
For gui.py, I have designed the following mockup:
When I call webcamOCR from OCRDetection.py, all it does is open the webcam feed in a separate window.
How can I make it display the video feed inside the GUI, just as I designed it? Any pointers would be great.
This is the code I've written so far:
OCRDetection.py:
import numpy as np
import argparse
import imutils
import time
import cv2
import pytesseract
from imutils.video import VideoStream
from imutils.video import FPS
from imutils.object_detection import non_max_suppression
# Location of the Tesseract executable that pytesseract should invoke.
# This is a macOS Homebrew Cellar path pinned to Tesseract 5.1.0; it has to be
# adjusted on machines where Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.1.0/bin/tesseract'
def box_extractor(scores, geometry, min_confidence):
    """Turn per-cell score/geometry maps into candidate boxes.

    Every cell whose score is not below ``min_confidence`` produces one
    axis-aligned rectangle. Cells are spaced 4 pixels apart in both
    directions (the ``* 4.0`` offsets below).

    Args:
        scores: array indexed as ``scores[0, 0, row, col]``; its last two
            dimensions give the grid size.
        geometry: array indexed as ``geometry[0, channel, row, col]`` with
            channels 0-3 holding box distances and channel 4 the rotation
            angle (presumably the EAST detector's output layout -- confirm
            against the model in use).
        min_confidence: cells scoring below this value are skipped.

    Returns:
        ``(rectangles, confidences)`` -- a list of
        ``(start_x, start_y, end_x, end_y)`` integer tuples and the list of
        matching scores, in row-major scan order.
    """
    n_rows, n_cols = scores.shape[2:4]
    found_boxes, found_scores = [], []

    for row in range(n_rows):
        row_scores = scores[0, 0, row]
        # Unpack the five geometry channels for this row in one go.
        d0, d1, d2, d3, row_angles = (geometry[0, ch, row] for ch in range(5))

        for col in range(n_cols):
            score = row_scores[col]
            if score < min_confidence:
                continue

            theta = row_angles[col]
            cos_t, sin_t = np.cos(theta), np.sin(theta)

            # The bottom-right corner is derived first; the top-left corner
            # then follows from the box width/height.
            right = int(col * 4.0 + (cos_t * d1[col]) + (sin_t * d2[col]))
            bottom = int(row * 4.0 + (cos_t * d2[col]) - (sin_t * d1[col]))
            left = int(right - (d1[col] + d3[col]))
            top = int(bottom - (d0[col] + d2[col]))

            found_boxes.append((left, top, right, bottom))
            found_scores.append(score)

    return found_boxes, found_scores
def get_arguments():
    """Parse the OCR script's command-line options into a plain dict.

    The options are read from the running process's command line, so any
    script that ends up calling this (directly or indirectly) must be started
    with at least the required ``--east`` option.

    Returns:
        dict with the keys ``video``, ``east``, ``min_confidence``, ``width``,
        ``height`` and ``padding``.
    """
    option_table = (
        (('-v', '--video'),
         {'type': str, 'help': 'path to optional video file'}),
        (('-east', '--east'),
         {'type': str, 'required': True,
          'help': 'path to EAST text detection model'}),
        (('-c', '--min_confidence'),
         {'type': float, 'default': 0.5,
          'help': 'minimum confidence to process a region'}),
        (('-w', '--width'),
         {'type': int, 'default': 320,
          'help': 'resized image width (multiple of 32)'}),
        (('-e', '--height'),
         {'type': int, 'default': 320,
          'help': 'resized image height (multiple of 32)'}),
        (('-p', '--padding'),
         {'type': float, 'default': 0.0,
          'help': 'padding on each ROI border'}),
    )

    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return vars(parser.parse_args())
# Run text detection and OCR on a webcam or video-file stream and display the
# annotated frames in an OpenCV window titled "Real-Time OCR".
#
# Takes no parameters: every setting comes from get_arguments(), i.e. from the
# command line of whichever script is running. There is no return statement.
# The frame loop ends when a frame is None or when 'q' is pressed.
#
# NOTE(review): this listing has lost its indentation, so the nesting of the
# conditionals and of the two consecutive `break` statements at the end of the
# box loop cannot be confirmed from the text alone -- check the real file.
def webcamOCR():
# Parses the process command line; --east is required there.
args = get_arguments()
# w/h: size of the first display frame, filled in lazily inside the loop.
w, h = None, None
# new_w/new_h: size the frame is resized to before it is fed to the network.
new_w, new_h = args['width'], args['height']
ratio_w, ratio_h = None, None
# Output layers requested from the network; net.forward() returns them in this
# order and they are unpacked below as (scores, geometry).
layer_names = ['feature_fusion/Conv_7/Sigmoid', 'feature_fusion/concat_3']
print("Starting the OCR Engine...")
net = cv2.dnn.readNet(args["east"])
# No --video given: read from camera source 0 via imutils' VideoStream.
# Otherwise open the given file with cv2.VideoCapture.
if not args.get('video', False):
print("Activating webcam...")
vs = VideoStream(src=0).start()
# presumably gives the camera time to start delivering frames -- confirm
time.sleep(1)
print("Waiting to detect text...")
else:
vs = cv2.VideoCapture(args['video'])
fps = FPS().start()
while True:
frame = vs.read()
# In video-file mode read() yields a pair and the frame is its second item;
# in webcam mode the value is used as the frame directly.
frame = frame[1] if args.get('video', False) else frame
if frame is None:
break
frame = imutils.resize(frame, width=1000)
# `orig` is the copy that gets cropped for OCR, drawn on and displayed;
# `frame` itself is shrunk further for the network.
orig = frame.copy()
orig_h, orig_w = orig.shape[:2]
# Scale factors between display size and network input size, computed once.
if w is None or h is None:
h, w = frame.shape[:2]
ratio_w = w / float(new_w)
ratio_h = h / float(new_h)
frame = cv2.resize(frame, (new_w, new_h))
blob = cv2.dnn.blobFromImage(frame, 1.0, (new_w, new_h), (123.68, 116.78, 103.94),
swapRB=True, crop=False)
net.setInput(blob)
scores, geometry = net.forward(layer_names)
rectangles, confidences = box_extractor(scores, geometry, min_confidence=args['min_confidence'])
boxes = non_max_suppression(np.array(rectangles), probs=confidences)
for (start_x, start_y, end_x, end_y) in boxes:
# Map box corners from network-input coordinates back to `orig` coordinates.
start_x = int(start_x * ratio_w)
start_y = int(start_y * ratio_h)
end_x = int(end_x * ratio_w)
end_y = int(end_y * ratio_h)
# Padding is a fraction of the box size; the start corner moves out by one
# padding step and the end corner by two, both clamped to the frame bounds.
dx = int((end_x - start_x) * args['padding'])
dy = int((end_y - start_y) * args['padding'])
start_x = max(0, start_x - dx)
start_y = max(0, start_y - dy)
end_x = min(orig_w, end_x + (dx * 2))
end_y = min(orig_h, end_y + (dy * 2))
ocRegion = orig[start_y:end_y, start_x:end_x]
# Recognize text in the cropped region; `config` is the option string handed
# to Tesseract (language eng, --oem 1, --psm 6).
config = '-l eng --oem 1 --psm 6'
text = pytesseract.image_to_string(ocRegion, config=config)
# Only non-empty results of exactly 8 characters are printed.
# NOTE(review): the length check counts every character of the raw result,
# including any trailing whitespace it may contain -- confirm this is intended.
if text != "":
if len(text)==8:
print(f"Detected text: {text}")
cv2.rectangle(orig, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)
cv2.putText(orig, text, (start_x, start_y - 20),
cv2.FONT_HERSHEY_COMPLEX, 1.2, (34, 226, 66), 3)
# NOTE(review): which block each of these two breaks belongs to is not
# recoverable from this listing.
break
break
fps.update()
# cv2.imshow is what opens the separate OpenCV window; nothing here hands the
# frame to any other GUI toolkit.
cv2.imshow("Real-Time OCR", orig)
key = cv2.waitKey(1) & 0xFF
if key == ord('q'):
break
fps.stop()
print(f"[INFO] elapsed time {round(fps.elapsed(), 2)}")
print(f"[INFO] approx. FPS : {round(fps.fps(), 2)}")
# Shut down whichever source was opened above, then close OpenCV's windows.
if not args.get('video', False):
vs.stop()
else:
vs.release()
cv2.destroyAllWindows()
and this is gui.py:
from tkinter import *
from tkinter import filedialog
from PIL import Image
from PIL import ImageTk
import cv2
import imutils
from OCRDetection import *
def Exit():
    """Destroy the root Tk window.

    NOTE(review): a later ``def Exit`` in this file rebinds the name, so this
    version is not the one the Exit button ends up calling.
    """
    root = window
    root.destroy()
def Start():
    """Open camera 0 into the global ``cap`` and start the OCR feed.

    NOTE(review): ``cv2.CAP_DSHOW`` requests a specific OpenCV capture
    backend -- confirm it is available on the target OS. ``webcamOCR`` also
    opens video source 0 on its own, so the device is requested twice.
    """
    global cap
    camera_index, backend = 0, cv2.CAP_DSHOW
    cap = cv2.VideoCapture(camera_index, backend)
    webcamFeed()
def Stop():
    """Clear the video label and release the camera if one is open.

    The capture is looked up defensively: pressing the button before Start()
    has ever created ``cap`` used to raise NameError, and a ``cap`` that had
    been reset to None raised AttributeError. Both cases are now a no-op for
    the release step while the labels are still cleared.
    """
    lblVideo.image = ""
    lblEmpty.configure(text="")

    # `cap` only exists as a module global once Start() has run.
    capture = globals().get("cap")
    if capture is not None:
        capture.release()
def webcamFeed():
    """Run the OCR loop from OCRDetection.

    The call is synchronous: control returns only when ``webcamOCR`` finishes.
    Because this is reached from a Tk button callback, the Tk main loop does
    not process events in the meantime.
    """
    webcamOCR()
def Exit():
    """Release the camera (if any) and close the application window.

    This definition is the one bound to the Exit button, because it replaces
    the earlier ``Exit`` that only destroyed the window. It therefore has to
    do both jobs: previously the window was never destroyed, and pressing
    Exit before a capture existed raised NameError.
    """
    global cap

    # `cap` only exists once Start() has run, and may already be None.
    capture = globals().get("cap")
    if capture is not None:
        capture.release()
    cap = None

    window.destroy()
# Root window.
window = Tk()
window.title("Real Time OCR")

# Heading, top-left grid cell.
formTitle = Label(window, text="Real Time OCR", font=("Arial", 24))
formTitle.grid(row=0, column=0, columnspan=1)

# Control buttons. `command` receives the function objects bound to these
# names at the moment this code runs.
btn = Button(
    window,
    text="Start",
    font=("Arial", 12),
    background="gray",
    fg="white",
    command=Start,
)
btn2 = Button(
    window,
    text="Detener",
    font=("Arial", 12),
    background="orange",
    fg="white",
    command=Stop,
)
btn3 = Button(
    window,
    text="Exit",
    font=("Arial", 12),
    background="red",
    fg="white",
    command=Exit,
)

# Label reserved for the video image; it spans two columns on row 1.
lblVideo = Label(window)
lblVideo.grid(row=1, column=0, columnspan=2, pady=10)

# Buttons share row 2, one per column.
btn.grid(row=2, column=0)
btn2.grid(row=2, column=1)
btn3.grid(row=2, column=2)

# Empty fixed-width label placed in column 4 of the top row.
lblEmpty = Label(window, text="", width=20)
lblEmpty.grid(row=0, column=4)

# Hand control to Tk; this call blocks until the window is destroyed.
window.mainloop()
Note: In order to run gui.py, I have to pass the arguments that OCRDetection.py requires.
Example: `python3 OCRDetection.py -east frozen_east_text_detection.pb`
python
opencv
tkinter
tesseract
python-tesseract
0 Answers
Your Answer