# faceDetector.py
import math
import time
from collections import deque

import cv2
import mediapipe as mp

mediaPipeDraw = mp.solutions.drawing_utils
mediaPipeFaceMesh = mp.solutions.face_mesh

class Const:
MILLISECONDS_IN_ONE_SECOND = 1000
class Landmark:
    def __init__(self, timestamp) -> None:
        self.timestamp = timestamp
        self.points = {}  # {pointCode1: [x, y, z], pointCode2: [x, y, z]}
        # Lip separation normalized to face height: the top of the head counts as 0%
        # and the tip of the chin as 100%, so the value stays comparable across faces
        # at different distances from the camera.
        self.lipSeparation = 0
        self.speaking = True

    def setPoint(self, pointCode, x, y, z):
        self.points[pointCode] = [x, y, z]

    def storeLipSeparation(self, normalizedAbsoluteDistance):
        self.lipSeparation = normalizedAbsoluteDistance
class VideoFaceProcessor:
    def __init__(self, videoSource, displayMesh=False) -> None:
        self.drawSettings = mediaPipeDraw.DrawingSpec(thickness=1, circle_radius=1)
        self.videoSource = videoSource
        self.displayMesh = displayMesh
        self.minimumDetectionConfidence = 0.5
        self.minimumTrackingConfidence = 0.5
        self.topOfHead = 10  # canonical face model point for the top of the head
        self.tipOfChin = 152  # canonical face model point for the tip of the chin
        # Canonical face model points on the upper lip, compared pairwise with the
        # corresponding lower lip points (the point IDs are catalogued at
        # https://github.com/google/mediapipe/issues/1615).
        self.upperLipPoints = [82, 13, 312]
        self.lowerLipPoints = [87, 14, 317]
        self.faces = {}  # {faceID: deque[Landmark instance1, Landmark instance2, ...]}
        self.hardCodedFaceID = "face1"  # TODO: face detection needs to be more generic before a real faceID can be assigned to the self.faces dict
        self.fps = None
        self.fontDisplayPosition = (10, 30)
        self.fontScale = 0.5
        self.fontColor = (0, 255, 0)
        self.fontThickness = 2
        self.pauseDuration = None  # consecutive closed-mouth frames that count as a pause; set in run() once the FPS is known
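
    # The compared pairs are (82, 87), (13, 14) and (312, 317); 13 and 14 are the centre
    # points of the inner upper and lower lip, with the other two pairs just beside them
    # (per the MediaPipe mesh numbering referenced above).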
    def run(self):
        videoHandle = cv2.VideoCapture(self.videoSource)
        self.fps = videoHandle.get(cv2.CAP_PROP_FPS)
        # Scale the pause window to the actual frame rate: in an experiment on a 30 FPS
        # video, 7 consecutive closed-mouth frames indicated a pause.
        self.pauseDuration = int((7 / 30) * self.fps)
        print(f"Video has {self.fps} FPS and pause detection duration = {self.pauseDuration} frames. Processing...")
        frameNumber = 0
        with mediaPipeFaceMesh.FaceMesh(min_detection_confidence=self.minimumDetectionConfidence, min_tracking_confidence=self.minimumTrackingConfidence) as detectedMesh:
            while videoHandle.isOpened():  # as long as there are frames
                frameExists, theImage = videoHandle.read()
                if not frameExists:  # reached end of video
                    break  # for a stream, you'd use `continue` here
                # ---preprocess
                theImage = cv2.cvtColor(theImage, cv2.COLOR_BGR2RGB)
                theImage.flags.writeable = False  # a performance improvement (optional)
                processedImage = detectedMesh.process(theImage)
                # ---extract desired points
                theImage.flags.writeable = True
                theImage = cv2.cvtColor(theImage, cv2.COLOR_RGB2BGR)
                timestamp = videoHandle.get(cv2.CAP_PROP_POS_MSEC) / Const.MILLISECONDS_IN_ONE_SECOND
                print(f"Frame {frameNumber}, timestamp {timestamp}")
                if processedImage.multi_face_landmarks:
                    for detectedFace in processedImage.multi_face_landmarks:
                        if self.hardCodedFaceID not in self.faces:  # face not yet present in dict
                            self.faces[self.hardCodedFaceID] = deque()  # add new face
                        if self.displayMesh:
                            mediaPipeDraw.draw_landmarks(image=theImage, landmark_list=detectedFace, connections=mediaPipeFaceMesh.FACEMESH_CONTOURS, landmark_drawing_spec=self.drawSettings, connection_drawing_spec=self.drawSettings)
                        landmarkObject = Landmark(timestamp)
                        for pointIterator, pointOnFace in enumerate(detectedFace.landmark):
                            if pointIterator in self.upperLipPoints or pointIterator in self.lowerLipPoints or pointIterator in (self.topOfHead, self.tipOfChin):
                                landmarkObject.setPoint(pointIterator, pointOnFace.x, pointOnFace.y, pointOnFace.z)
                        self.faces[self.hardCodedFaceID].append(landmarkObject)
                if self.displayMesh:
                    # Display only after the mesh has been drawn, so the overlay is visible on the frame.
                    self.displayVideo(theImage, f'FPS: {int(self.fps)}')
                frameNumber = frameNumber + 1
        videoHandle.release()
        # print(f"Finished processing. {len(self.faces[self.hardCodedFaceID])} landmark objects added for face {self.hardCodedFaceID}")
        self.calculateLipMovement()
    def displayVideo(self, image, textToDisplay):
        cv2.putText(image, textToDisplay, self.fontDisplayPosition, cv2.FONT_HERSHEY_DUPLEX, self.fontScale, self.fontColor, self.fontThickness)
        cv2.imshow('Lip movement detection', image)
        cv2.waitKey(1)  # gives HighGUI a chance to actually render the frame
    def displayPoints(self):
        # Debug helper: the per-landmark prints are left commented out to keep the console readable.
        print("Detected points:")
        for faceID, landmarkDeque in self.faces.items():
            # print(f"{len(landmarkDeque)} items are present in landmarkDeque")
            for landmark in landmarkDeque:
                points = []
                for pointCode, listOfXYZPoints in landmark.points.items():
                    points.append(f"{pointCode}:{listOfXYZPoints}")
                # print(f"timestamp:{landmark.timestamp}: {points}")
    def calculateLipMovement(self):
        # ---calculate the normalized lip separation for every frame of every face
        for faceID, landmarkDeque in self.faces.items():
            # print(f"{len(landmarkDeque)} landmarks")
            prevLipSeparation = 0
            for landmark in landmarkDeque:  # each landmark is a Landmark instance
                faceHeight = math.dist(landmark.points[self.topOfHead], landmark.points[self.tipOfChin])  # math.dist is already non-negative, so no abs() is needed
                # ---calculate distances between opposing points on the upper and lower lips
                averageDistance = self.calculateAverageLipOpenDistance(landmark.points)
                # ---normalize the distance to a percentage of the face height
                normalizedLipSeparation = averageDistance * 100 / faceHeight
                landmark.storeLipSeparation(normalizedLipSeparation)
                print(f"Time:{landmark.timestamp:.2f} diff:{abs(landmark.lipSeparation - prevLipSeparation):.2f} Sep:{landmark.lipSeparation:.2f}")
                prevLipSeparation = landmark.lipSeparation
            self.determineSilencePhases(landmarkDeque)
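
    # Worked example (illustrative numbers, not from the original): with MediaPipe's
    # normalized coordinates, a head-to-chin distance of 0.5 and an average lip gap of
    # 0.01 yield a stored lipSeparation of 0.01 * 100 / 0.5 = 2, i.e. 2% of face height.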
    def getDetectedSilences(self):
        return self.faces[self.hardCodedFaceID]  # the deque of Landmark objects; their .speaking flags encode the detected silences
    def determineSilencePhases(self, landmarkDeque):
        # ---use a sliding window to determine whether the person is speaking (assuming
        # roughly 4 syllables per second: https://en.wikipedia.org/wiki/Speech_tempo)
        pastFew = deque(maxlen=self.pauseDuration)
        mouthOpeningThreshold = 1  # lip separation (in % of face height) below which the mouth counts as closed
        for i in range(len(landmarkDeque)):
            pastFew.append(landmarkDeque[i].lipSeparation)
            # Judge only once the window is full; otherwise a closed mouth in the very
            # first frames would immediately be misread as a pause.
            if len(pastFew) == self.pauseDuration and all(separation < mouthOpeningThreshold for separation in pastFew):  # mouth has been closed for a few frames
                for j in range(i, i - len(pastFew), -1):  # go in reverse and mark those landmark objects as mouth closed
                    landmarkDeque[j].speaking = False
        # self.showDetectedSilencePhases(landmarkDeque)
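
    # Worked example: at 30 FPS the window is int((7 / 30) * 30) = 7 frames, so a mouth
    # held closed for roughly 7/30 ≈ 0.23 seconds gets marked as a pause.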
    def calculateAverageLipOpenDistance(self, points):
        # Average the Euclidean distances between each upper-lip point and its lower-lip counterpart.
        totalDistance = 0
        for i in range(len(self.upperLipPoints)):
            totalDistance += math.dist(points[self.upperLipPoints[i]], points[self.lowerLipPoints[i]])
        return totalDistance / len(self.upperLipPoints)
    def showDetectedSilencePhases(self, landmarkDeque):
        for landmark in landmarkDeque:
            print(f"Time:{landmark.timestamp} speaking:{landmark.speaking}")
        videoHandle = cv2.VideoCapture(self.videoSource)
        frameNumber = 0
        while videoHandle.isOpened():  # as long as there are frames
            frameExists, theImage = videoHandle.read()
            if not frameExists:  # reached end of video
                break  # for a stream, you'd use `continue` here
            if frameNumber >= len(landmarkDeque):  # frames without a detected face have no Landmark, so stop rather than index past the end of the deque
                break
            # The image must stay writeable here: displayVideo() draws text onto it.
            textToDisplay = "Speaking" if landmarkDeque[frameNumber].speaking else "Silence"
            self.displayVideo(theImage, textToDisplay)
            time.sleep(1 / self.fps)  # roughly real-time playback
            frameNumber = frameNumber + 1
        videoHandle.release()
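

# A minimal usage sketch, not part of the original file: "myVideo.mp4" is a
# hypothetical placeholder path, and displayMesh=True assumes a desktop
# environment where OpenCV windows can be shown.
if __name__ == "__main__":
    processor = VideoFaceProcessor("myVideo.mp4", displayMesh=True)
    processor.run()  # extracts landmarks, prints per-frame lip separation, flags silences
    landmarks = processor.getDetectedSilences()  # deque of Landmark objects with .speaking set
    processor.showDetectedSilencePhases(landmarks)  # replay annotated with Speaking/Silence
    cv2.destroyAllWindows()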