marc nicole
2024-07-30 18:18:42 UTC
Hello all,
I want to predict an object by given as input an image and want to have my
model be able to predict the label. I have trained a model using tensorflow
based on annotated database where the target object to predict was added to
the pretrained model. the code I am using is the following where I set the
target object image as input and want to have the prediction output:
class MultiObjectDetection():
def __init__(self, classes_name):
self._classes_name = classes_name
self._num_classes = len(classes_name)
self._common_params = {'image_size': 448, 'num_classes':
self._num_classes,
'batch_size':1}
self._net_params = {'cell_size': 7, 'boxes_per_cell':2,
'weight_decay': 0.0005}
self._net = YoloTinyNet(self._common_params, self._net_params,
test=True)
def predict_object(self, image):
predicts = self._net.inference(image)
return predicts
def process_predicts(self, resized_img, predicts, thresh=0.2):
"""
process the predicts of object detection with one image input.
Args:
resized_img: resized source image.
predicts: output of the model.
thresh: thresh of bounding box confidence.
Return:
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
"""
cls_num = self._num_classes
bbx_per_cell = self._net_params["boxes_per_cell"]
cell_size = self._net_params["cell_size"]
img_size = self._common_params["image_size"]
p_classes = predicts[0, :, :, 0:cls_num]
C = predicts[0, :, :, cls_num:cls_num+bbx_per_cell] # two
bounding boxes in one cell.
coordinate = predicts[0, :, :, cls_num+bbx_per_cell:] # all
bounding boxes position.
p_classes = np.reshape(p_classes, (cell_size, cell_size, 1, cls_num))
C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))
P = C * p_classes # confidencefor all classes of all bounding
boxes (cell_size, cell_size, bounding_box_num, class_num) = (7, 7, 2,
1).
predicts_dict = {}
for i in range(cell_size):
for j in range(cell_size):
temp_data = np.zeros_like(P, np.float32)
temp_data[i, j, :, :] = P[i, j, :, :]
position = np.argmax(temp_data) # refer to the class
num (with maximum confidence) for every bounding box.
index = np.unravel_index(position, P.shape)
if P[index] > thresh:
class_num = index[-1]
coordinate = np.reshape(coordinate, (cell_size,
cell_size, bbx_per_cell, 4)) # (cell_size, cell_size,
bbox_num_per_cell, coordinate)[xmin, ymin, xmax, ymax]
max_coordinate = coordinate[index[0], index[1], index[2], :]
xcenter = max_coordinate[0]
ycenter = max_coordinate[1]
w = max_coordinate[2]
h = max_coordinate[3]
xcenter = (index[1] + xcenter) * (1.0*img_size /cell_size)
ycenter = (index[0] + ycenter) * (1.0*img_size /cell_size)
w = w * img_size
h = h * img_size
xmin = 0 if (xcenter - w/2.0 < 0) else (xcenter - w/2.0)
ymin = 0 if (xcenter - w/2.0 < 0) else (ycenter - h/2.0)
xmax = resized_img.shape[0] if (xmin + w) >
resized_img.shape[0] else (xmin + w)
ymax = resized_img.shape[1] if (ymin + h) >
resized_img.shape[1] else (ymin + h)
class_name = self._classes_name[class_num]
predicts_dict.setdefault(class_name, [])
predicts_dict[class_name].append([int(xmin),
int(ymin), int(xmax), int(ymax), P[index]])
return predicts_dict
def non_max_suppress(self, predicts_dict, threshold=0.5):
"""
implement non-maximum supression on predict bounding boxes.
Args:
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
threshhold: iou threshold
Return:
predicts_dict processed by non-maximum suppression
"""
for object_name, bbox in predicts_dict.items():
bbox_array = np.array(bbox, dtype=np.float)
x1, y1, x2, y2, scores = bbox_array[:,0], bbox_array[:,1],
bbox_array[:,2], bbox_array[:,3], bbox_array[:,4]
areas = (x2-x1+1) * (y2-y1+1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
inter = np.maximum(0.0, xx2-xx1+1) * np.maximum(0.0, yy2-yy1+1)
iou = inter/(areas[i]+areas[order[1:]]-inter)
indexs = np.where(iou<=threshold)[0]
order = order[indexs+1]
bbox = bbox_array[keep]
predicts_dict[object_name] = bbox.tolist()
predicts_dict = predicts_dict
return predicts_dict
class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle",
"bus", "car", "cat", "chair", "cow", "diningtable",
"dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor",
"small_ball"]
modelFile = ('models\\train\\model.ckpt-0')
track_object = "small_ball"print("object detection and tracking...")
multiObjectDetect = MultiObjectDetection(IP, class_names)
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
object_predicts = multiObjectDetect.predict_object(image)
sess = tf.Session()
saver = tf.train.Saver(multiObjectDetect._net.trainable_collection)
saver.restore(sess, modelFile)
index = 0while 1:
src_img = cv2.imread("./weirdobject.jpg")
resized_img = cv2.resize(src_img, (448, 448))
np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
np_img = np_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1
np_img = np.reshape(np_img, (1, 448, 448, 3))
np_predict = sess.run(object_predicts, feed_dict={image: np_img})
predicts_dict = multiObjectDetect.process_predicts(resized_img, np_predict)
predicts_dict = multiObjectDetect.non_max_suppress(predicts_dict)
print ("predict dict = ", predicts_dict)
The problem with this code is that the predicts_dict returns:
predict dict = {'sheep': [[233.0, 92.0, 448.0, -103.0,
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]], 'horse': [[374.0, 33.0,
282.0, 448.0, 5.277851581573486], [135.0, 688.0, -33.0, -14.0,
3.5144259929656982], [1.0, 117.0, 112.0, -138.0, 2.656987190246582]],
'bicycle': [[461.0, 781.0, 154.0, -381.0, 5.918102741241455], [70.0,
344.0, 391.0, -138.0, 3.031444787979126], [378.0, 497.0, 46.0, 149.0,
2.7629122734069824], [541.0, 583.0, 69.0, 307.0, 2.7170517444610596],
[323.0, 22.0, 336.0, 448.0, 1.608760952949524]], 'bottle': [[390.0,
218.0, -199.0, 448.0, 4.582971096038818], [0.0, 0.0, 448.0, -410.0,
0.9097045063972473]], 'sofa': [[346.0, 102.0, 323.0, -38.0,
2.371835947036743]], 'dog': [[319.0, 254.0, -282.0, 373.0,
4.022889137268066]], 'cat': [[63.0, -195.0, 365.0, -92.0,
3.5134828090667725]], 'person': [[22.0, -122.0, 154.0, 448.0,
3.927537441253662], [350.0, 155.0, -36.0, -445.0, 2.679833173751831],
[119.0, 416.0, -43.0, 292.0, 0.9529445171356201], [251.0, 445.0,
225.0, 188.0, 0.9001350402832031]], 'train': [[329.0, 485.0, -24.0,
-235.0, 2.7050414085388184], [483.0, 362.0, 237.0, -86.0,
2.555817127227783], [13.0, 365.0, 373.0, 448.0, 0.6229299902915955]],
'small_ball': [[217.0, 737.0, 448.0, -315.0, 1.739920973777771],
[117.0, 283.0, 153.0, 122.0, 1.5690066814422607]], 'boat': [[164.0,
805.0, 34.0, -169.0, 4.972668170928955], [0.0, 0.0, 397.0, 69.0,
2.353729486465454], [302.0, 605.0, 15.0, -22.0, 2.0259625911712646]],
'aeroplane': [[470.0, 616.0, -305.0, -37.0, 3.431873321533203], [0.0,
0.0, 448.0, -72.0, 2.836672306060791]], 'bus': [[0.0, 0.0, -101.0,
-280.0, 1.2078320980072021]], 'pottedplant': [[620.0, -268.0, -124.0,
418.0, 2.158564805984497], [0.0, 0.0, 448.0, -779.0,
1.6623022556304932]], 'tvmonitor': [[0.0, 0.0, 448.0, 85.0,
3.238999128341675], [240.0, 772.0, 200.0, 91.0, 1.7443398237228394],
[546.0, 155.0, 448.0, 448.0, 1.1334525346755981], [107.0, 441.0,
432.0, 219.0, 0.5971617698669434]], 'chair': [[470.0, -187.0, 106.0,
235.0, 3.8548083305358887], [524.0, 740.0, -103.0, 99.0,
3.636549234390259], [0.0, 0.0, 275.0, -325.0, 3.0997846126556396],
[711.0, -231.0, -146.0, 392.0, 2.205275535583496]], 'diningtable':
[[138.0, -310.0, 111.0, 448.0, 4.660728931427002], [317.0, -66.0,
313.0, 6.0, 4.535496234893799], [0.0, 0.0, -41.0, 175.0,
1.8571208715438843], [21.0, -92.0, 76.0, 172.0, 1.2035608291625977],
[0.0, 0.0, 448.0, -250.0, 1.00322687625885]], 'car': [[312.0, 232.0,
132.0, 309.0, 3.205225706100464], [514.0, -76.0, 218.0, 448.0,
1.4289973974227905], [0.0, 0.0, 448.0, 142.0, 0.7124998569488525]]}
WHile I expect only the dict to contain the small_ball key
How's that is possible? where's the prediction output?How to fix the code?
I want to predict an object by given as input an image and want to have my
model be able to predict the label. I have trained a model using tensorflow
based on annotated database where the target object to predict was added to
the pretrained model. the code I am using is the following where I set the
target object image as input and want to have the prediction output:
class MultiObjectDetection():
def __init__(self, classes_name):
self._classes_name = classes_name
self._num_classes = len(classes_name)
self._common_params = {'image_size': 448, 'num_classes':
self._num_classes,
'batch_size':1}
self._net_params = {'cell_size': 7, 'boxes_per_cell':2,
'weight_decay': 0.0005}
self._net = YoloTinyNet(self._common_params, self._net_params,
test=True)
def predict_object(self, image):
predicts = self._net.inference(image)
return predicts
def process_predicts(self, resized_img, predicts, thresh=0.2):
"""
process the predicts of object detection with one image input.
Args:
resized_img: resized source image.
predicts: output of the model.
thresh: thresh of bounding box confidence.
Return:
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
"""
cls_num = self._num_classes
bbx_per_cell = self._net_params["boxes_per_cell"]
cell_size = self._net_params["cell_size"]
img_size = self._common_params["image_size"]
p_classes = predicts[0, :, :, 0:cls_num]
C = predicts[0, :, :, cls_num:cls_num+bbx_per_cell] # two
bounding boxes in one cell.
coordinate = predicts[0, :, :, cls_num+bbx_per_cell:] # all
bounding boxes position.
p_classes = np.reshape(p_classes, (cell_size, cell_size, 1, cls_num))
C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))
P = C * p_classes # confidencefor all classes of all bounding
boxes (cell_size, cell_size, bounding_box_num, class_num) = (7, 7, 2,
1).
predicts_dict = {}
for i in range(cell_size):
for j in range(cell_size):
temp_data = np.zeros_like(P, np.float32)
temp_data[i, j, :, :] = P[i, j, :, :]
position = np.argmax(temp_data) # refer to the class
num (with maximum confidence) for every bounding box.
index = np.unravel_index(position, P.shape)
if P[index] > thresh:
class_num = index[-1]
coordinate = np.reshape(coordinate, (cell_size,
cell_size, bbx_per_cell, 4)) # (cell_size, cell_size,
bbox_num_per_cell, coordinate)[xmin, ymin, xmax, ymax]
max_coordinate = coordinate[index[0], index[1], index[2], :]
xcenter = max_coordinate[0]
ycenter = max_coordinate[1]
w = max_coordinate[2]
h = max_coordinate[3]
xcenter = (index[1] + xcenter) * (1.0*img_size /cell_size)
ycenter = (index[0] + ycenter) * (1.0*img_size /cell_size)
w = w * img_size
h = h * img_size
xmin = 0 if (xcenter - w/2.0 < 0) else (xcenter - w/2.0)
ymin = 0 if (xcenter - w/2.0 < 0) else (ycenter - h/2.0)
xmax = resized_img.shape[0] if (xmin + w) >
resized_img.shape[0] else (xmin + w)
ymax = resized_img.shape[1] if (ymin + h) >
resized_img.shape[1] else (ymin + h)
class_name = self._classes_name[class_num]
predicts_dict.setdefault(class_name, [])
predicts_dict[class_name].append([int(xmin),
int(ymin), int(xmax), int(ymax), P[index]])
return predicts_dict
def non_max_suppress(self, predicts_dict, threshold=0.5):
"""
implement non-maximum supression on predict bounding boxes.
Args:
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
threshhold: iou threshold
Return:
predicts_dict processed by non-maximum suppression
"""
for object_name, bbox in predicts_dict.items():
bbox_array = np.array(bbox, dtype=np.float)
x1, y1, x2, y2, scores = bbox_array[:,0], bbox_array[:,1],
bbox_array[:,2], bbox_array[:,3], bbox_array[:,4]
areas = (x2-x1+1) * (y2-y1+1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
inter = np.maximum(0.0, xx2-xx1+1) * np.maximum(0.0, yy2-yy1+1)
iou = inter/(areas[i]+areas[order[1:]]-inter)
indexs = np.where(iou<=threshold)[0]
order = order[indexs+1]
bbox = bbox_array[keep]
predicts_dict[object_name] = bbox.tolist()
predicts_dict = predicts_dict
return predicts_dict
class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle",
"bus", "car", "cat", "chair", "cow", "diningtable",
"dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor",
"small_ball"]
modelFile = ('models\\train\\model.ckpt-0')
track_object = "small_ball"print("object detection and tracking...")
multiObjectDetect = MultiObjectDetection(IP, class_names)
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
object_predicts = multiObjectDetect.predict_object(image)
sess = tf.Session()
saver = tf.train.Saver(multiObjectDetect._net.trainable_collection)
saver.restore(sess, modelFile)
index = 0while 1:
src_img = cv2.imread("./weirdobject.jpg")
resized_img = cv2.resize(src_img, (448, 448))
np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
np_img = np_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1
np_img = np.reshape(np_img, (1, 448, 448, 3))
np_predict = sess.run(object_predicts, feed_dict={image: np_img})
predicts_dict = multiObjectDetect.process_predicts(resized_img, np_predict)
predicts_dict = multiObjectDetect.non_max_suppress(predicts_dict)
print ("predict dict = ", predicts_dict)
The problem with this code is that the predicts_dict returns:
predict dict = {'sheep': [[233.0, 92.0, 448.0, -103.0,
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]], 'horse': [[374.0, 33.0,
282.0, 448.0, 5.277851581573486], [135.0, 688.0, -33.0, -14.0,
3.5144259929656982], [1.0, 117.0, 112.0, -138.0, 2.656987190246582]],
'bicycle': [[461.0, 781.0, 154.0, -381.0, 5.918102741241455], [70.0,
344.0, 391.0, -138.0, 3.031444787979126], [378.0, 497.0, 46.0, 149.0,
2.7629122734069824], [541.0, 583.0, 69.0, 307.0, 2.7170517444610596],
[323.0, 22.0, 336.0, 448.0, 1.608760952949524]], 'bottle': [[390.0,
218.0, -199.0, 448.0, 4.582971096038818], [0.0, 0.0, 448.0, -410.0,
0.9097045063972473]], 'sofa': [[346.0, 102.0, 323.0, -38.0,
2.371835947036743]], 'dog': [[319.0, 254.0, -282.0, 373.0,
4.022889137268066]], 'cat': [[63.0, -195.0, 365.0, -92.0,
3.5134828090667725]], 'person': [[22.0, -122.0, 154.0, 448.0,
3.927537441253662], [350.0, 155.0, -36.0, -445.0, 2.679833173751831],
[119.0, 416.0, -43.0, 292.0, 0.9529445171356201], [251.0, 445.0,
225.0, 188.0, 0.9001350402832031]], 'train': [[329.0, 485.0, -24.0,
-235.0, 2.7050414085388184], [483.0, 362.0, 237.0, -86.0,
2.555817127227783], [13.0, 365.0, 373.0, 448.0, 0.6229299902915955]],
'small_ball': [[217.0, 737.0, 448.0, -315.0, 1.739920973777771],
[117.0, 283.0, 153.0, 122.0, 1.5690066814422607]], 'boat': [[164.0,
805.0, 34.0, -169.0, 4.972668170928955], [0.0, 0.0, 397.0, 69.0,
2.353729486465454], [302.0, 605.0, 15.0, -22.0, 2.0259625911712646]],
'aeroplane': [[470.0, 616.0, -305.0, -37.0, 3.431873321533203], [0.0,
0.0, 448.0, -72.0, 2.836672306060791]], 'bus': [[0.0, 0.0, -101.0,
-280.0, 1.2078320980072021]], 'pottedplant': [[620.0, -268.0, -124.0,
418.0, 2.158564805984497], [0.0, 0.0, 448.0, -779.0,
1.6623022556304932]], 'tvmonitor': [[0.0, 0.0, 448.0, 85.0,
3.238999128341675], [240.0, 772.0, 200.0, 91.0, 1.7443398237228394],
[546.0, 155.0, 448.0, 448.0, 1.1334525346755981], [107.0, 441.0,
432.0, 219.0, 0.5971617698669434]], 'chair': [[470.0, -187.0, 106.0,
235.0, 3.8548083305358887], [524.0, 740.0, -103.0, 99.0,
3.636549234390259], [0.0, 0.0, 275.0, -325.0, 3.0997846126556396],
[711.0, -231.0, -146.0, 392.0, 2.205275535583496]], 'diningtable':
[[138.0, -310.0, 111.0, 448.0, 4.660728931427002], [317.0, -66.0,
313.0, 6.0, 4.535496234893799], [0.0, 0.0, -41.0, 175.0,
1.8571208715438843], [21.0, -92.0, 76.0, 172.0, 1.2035608291625977],
[0.0, 0.0, 448.0, -250.0, 1.00322687625885]], 'car': [[312.0, 232.0,
132.0, 309.0, 3.205225706100464], [514.0, -76.0, 218.0, 448.0,
1.4289973974227905], [0.0, 0.0, 448.0, 142.0, 0.7124998569488525]]}
WHile I expect only the dict to contain the small_ball key
How's that is possible? where's the prediction output?How to fix the code?