Predicting an object over an pretrained model is not working
marc nicole
2024-07-30 18:18:42 UTC
Hello all,

I want to predict an object by given as input an image and want to have my
model be able to predict the label. I have trained a model using tensorflow
based on annotated database where the target object to predict was added to
the pretrained model. the code I am using is the following where I set the
target object image as input and want to have the prediction output:

class MultiObjectDetection():

def __init__(self, classes_name):

self._classes_name = classes_name
self._num_classes = len(classes_name)

self._common_params = {'image_size': 448, 'num_classes':
self._net_params = {'cell_size': 7, 'boxes_per_cell':2,
'weight_decay': 0.0005}
self._net = YoloTinyNet(self._common_params, self._net_params,

def predict_object(self, image):
predicts = self._net.inference(image)
return predicts

def process_predicts(self, resized_img, predicts, thresh=0.2):
process the predicts of object detection with one image input.

resized_img: resized source image.
predicts: output of the model.
thresh: thresh of bounding box confidence.
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
cls_num = self._num_classes
bbx_per_cell = self._net_params["boxes_per_cell"]
cell_size = self._net_params["cell_size"]
img_size = self._common_params["image_size"]
p_classes = predicts[0, :, :, 0:cls_num]
C = predicts[0, :, :, cls_num:cls_num+bbx_per_cell] # two
bounding boxes in one cell.
coordinate = predicts[0, :, :, cls_num+bbx_per_cell:] # all
bounding boxes position.

p_classes = np.reshape(p_classes, (cell_size, cell_size, 1, cls_num))
C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))

P = C * p_classes # confidencefor all classes of all bounding
boxes (cell_size, cell_size, bounding_box_num, class_num) = (7, 7, 2,

predicts_dict = {}
for i in range(cell_size):
for j in range(cell_size):
temp_data = np.zeros_like(P, np.float32)
temp_data[i, j, :, :] = P[i, j, :, :]
position = np.argmax(temp_data) # refer to the class
num (with maximum confidence) for every bounding box.
index = np.unravel_index(position, P.shape)

if P[index] > thresh:
class_num = index[-1]
coordinate = np.reshape(coordinate, (cell_size,
cell_size, bbx_per_cell, 4)) # (cell_size, cell_size,
bbox_num_per_cell, coordinate)[xmin, ymin, xmax, ymax]
max_coordinate = coordinate[index[0], index[1], index[2], :]

xcenter = max_coordinate[0]
ycenter = max_coordinate[1]
w = max_coordinate[2]
h = max_coordinate[3]

xcenter = (index[1] + xcenter) * (1.0*img_size /cell_size)
ycenter = (index[0] + ycenter) * (1.0*img_size /cell_size)

w = w * img_size
h = h * img_size
xmin = 0 if (xcenter - w/2.0 < 0) else (xcenter - w/2.0)
ymin = 0 if (xcenter - w/2.0 < 0) else (ycenter - h/2.0)
xmax = resized_img.shape[0] if (xmin + w) >
resized_img.shape[0] else (xmin + w)
ymax = resized_img.shape[1] if (ymin + h) >
resized_img.shape[1] else (ymin + h)

class_name = self._classes_name[class_num]
predicts_dict.setdefault(class_name, [])
int(ymin), int(xmax), int(ymax), P[index]])

return predicts_dict

def non_max_suppress(self, predicts_dict, threshold=0.5):
implement non-maximum supression on predict bounding boxes.
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
threshhold: iou threshold
predicts_dict processed by non-maximum suppression
for object_name, bbox in predicts_dict.items():
bbox_array = np.array(bbox, dtype=np.float)
x1, y1, x2, y2, scores = bbox_array[:,0], bbox_array[:,1],
bbox_array[:,2], bbox_array[:,3], bbox_array[:,4]
areas = (x2-x1+1) * (y2-y1+1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
inter = np.maximum(0.0, xx2-xx1+1) * np.maximum(0.0, yy2-yy1+1)
iou = inter/(areas[i]+areas[order[1:]]-inter)
indexs = np.where(iou<=threshold)[0]
order = order[indexs+1]
bbox = bbox_array[keep]
predicts_dict[object_name] = bbox.tolist()
predicts_dict = predicts_dict
return predicts_dict

class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle",
"bus", "car", "cat", "chair", "cow", "diningtable",
"dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor",
modelFile = ('models\\train\\model.ckpt-0')
track_object = "small_ball"print("object detection and tracking...")

multiObjectDetect = MultiObjectDetection(IP, class_names)
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
object_predicts = multiObjectDetect.predict_object(image)

sess = tf.Session()
saver = tf.train.Saver(multiObjectDetect._net.trainable_collection)

saver.restore(sess, modelFile)

index = 0while 1:

src_img = cv2.imread("./weirdobject.jpg")
resized_img = cv2.resize(src_img, (448, 448))

np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
np_img = np_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1
np_img = np.reshape(np_img, (1, 448, 448, 3))

np_predict = sess.run(object_predicts, feed_dict={image: np_img})
predicts_dict = multiObjectDetect.process_predicts(resized_img, np_predict)
predicts_dict = multiObjectDetect.non_max_suppress(predicts_dict)

print ("predict dict = ", predicts_dict)

The problem with this code is that the predicts_dict returns:

predict dict = {'sheep': [[233.0, 92.0, 448.0, -103.0,
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]], 'horse': [[374.0, 33.0,
282.0, 448.0, 5.277851581573486], [135.0, 688.0, -33.0, -14.0,
3.5144259929656982], [1.0, 117.0, 112.0, -138.0, 2.656987190246582]],
'bicycle': [[461.0, 781.0, 154.0, -381.0, 5.918102741241455], [70.0,
344.0, 391.0, -138.0, 3.031444787979126], [378.0, 497.0, 46.0, 149.0,
2.7629122734069824], [541.0, 583.0, 69.0, 307.0, 2.7170517444610596],
[323.0, 22.0, 336.0, 448.0, 1.608760952949524]], 'bottle': [[390.0,
218.0, -199.0, 448.0, 4.582971096038818], [0.0, 0.0, 448.0, -410.0,
0.9097045063972473]], 'sofa': [[346.0, 102.0, 323.0, -38.0,
2.371835947036743]], 'dog': [[319.0, 254.0, -282.0, 373.0,
4.022889137268066]], 'cat': [[63.0, -195.0, 365.0, -92.0,
3.5134828090667725]], 'person': [[22.0, -122.0, 154.0, 448.0,
3.927537441253662], [350.0, 155.0, -36.0, -445.0, 2.679833173751831],
[119.0, 416.0, -43.0, 292.0, 0.9529445171356201], [251.0, 445.0,
225.0, 188.0, 0.9001350402832031]], 'train': [[329.0, 485.0, -24.0,
-235.0, 2.7050414085388184], [483.0, 362.0, 237.0, -86.0,
2.555817127227783], [13.0, 365.0, 373.0, 448.0, 0.6229299902915955]],
'small_ball': [[217.0, 737.0, 448.0, -315.0, 1.739920973777771],
[117.0, 283.0, 153.0, 122.0, 1.5690066814422607]], 'boat': [[164.0,
805.0, 34.0, -169.0, 4.972668170928955], [0.0, 0.0, 397.0, 69.0,
2.353729486465454], [302.0, 605.0, 15.0, -22.0, 2.0259625911712646]],
'aeroplane': [[470.0, 616.0, -305.0, -37.0, 3.431873321533203], [0.0,
0.0, 448.0, -72.0, 2.836672306060791]], 'bus': [[0.0, 0.0, -101.0,
-280.0, 1.2078320980072021]], 'pottedplant': [[620.0, -268.0, -124.0,
418.0, 2.158564805984497], [0.0, 0.0, 448.0, -779.0,
1.6623022556304932]], 'tvmonitor': [[0.0, 0.0, 448.0, 85.0,
3.238999128341675], [240.0, 772.0, 200.0, 91.0, 1.7443398237228394],
[546.0, 155.0, 448.0, 448.0, 1.1334525346755981], [107.0, 441.0,
432.0, 219.0, 0.5971617698669434]], 'chair': [[470.0, -187.0, 106.0,
235.0, 3.8548083305358887], [524.0, 740.0, -103.0, 99.0,
3.636549234390259], [0.0, 0.0, 275.0, -325.0, 3.0997846126556396],
[711.0, -231.0, -146.0, 392.0, 2.205275535583496]], 'diningtable':
[[138.0, -310.0, 111.0, 448.0, 4.660728931427002], [317.0, -66.0,
313.0, 6.0, 4.535496234893799], [0.0, 0.0, -41.0, 175.0,
1.8571208715438843], [21.0, -92.0, 76.0, 172.0, 1.2035608291625977],
[0.0, 0.0, 448.0, -250.0, 1.00322687625885]], 'car': [[312.0, 232.0,
132.0, 309.0, 3.205225706100464], [514.0, -76.0, 218.0, 448.0,
1.4289973974227905], [0.0, 0.0, 448.0, 142.0, 0.7124998569488525]]}

WHile I expect only the dict to contain the small_ball key

How's that is possible? where's the prediction output?How to fix the code?
Thomas Passin
2024-07-30 19:25:39 UTC
Hello all,
I want to predict an object by given as input an image and want to have my
model be able to predict the label. I have trained a model using tensorflow
based on annotated database where the target object to predict was added to
the pretrained model. the code I am using is the following where I set the
self._classes_name = classes_name
self._num_classes = len(classes_name)
self._net_params = {'cell_size': 7, 'boxes_per_cell':2,
'weight_decay': 0.0005}
self._net = YoloTinyNet(self._common_params, self._net_params,
predicts = self._net.inference(image)
return predicts
process the predicts of object detection with one image input.
resized_img: resized source image.
predicts: output of the model.
thresh: thresh of bounding box confidence.
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
cls_num = self._num_classes
bbx_per_cell = self._net_params["boxes_per_cell"]
cell_size = self._net_params["cell_size"]
img_size = self._common_params["image_size"]
p_classes = predicts[0, :, :, 0:cls_num]
C = predicts[0, :, :, cls_num:cls_num+bbx_per_cell] # two
bounding boxes in one cell.
coordinate = predicts[0, :, :, cls_num+bbx_per_cell:] # all
bounding boxes position.
p_classes = np.reshape(p_classes, (cell_size, cell_size, 1, cls_num))
C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))
P = C * p_classes # confidencefor all classes of all bounding
boxes (cell_size, cell_size, bounding_box_num, class_num) = (7, 7, 2,
predicts_dict = {}
temp_data = np.zeros_like(P, np.float32)
temp_data[i, j, :, :] = P[i, j, :, :]
position = np.argmax(temp_data) # refer to the class
num (with maximum confidence) for every bounding box.
index = np.unravel_index(position, P.shape)
class_num = index[-1]
coordinate = np.reshape(coordinate, (cell_size,
cell_size, bbx_per_cell, 4)) # (cell_size, cell_size,
bbox_num_per_cell, coordinate)[xmin, ymin, xmax, ymax]
max_coordinate = coordinate[index[0], index[1], index[2], :]
xcenter = max_coordinate[0]
ycenter = max_coordinate[1]
w = max_coordinate[2]
h = max_coordinate[3]
xcenter = (index[1] + xcenter) * (1.0*img_size /cell_size)
ycenter = (index[0] + ycenter) * (1.0*img_size /cell_size)
w = w * img_size
h = h * img_size
xmin = 0 if (xcenter - w/2.0 < 0) else (xcenter - w/2.0)
ymin = 0 if (xcenter - w/2.0 < 0) else (ycenter - h/2.0)
xmax = resized_img.shape[0] if (xmin + w) >
resized_img.shape[0] else (xmin + w)
ymax = resized_img.shape[1] if (ymin + h) >
resized_img.shape[1] else (ymin + h)
class_name = self._classes_name[class_num]
predicts_dict.setdefault(class_name, [])
int(ymin), int(xmax), int(ymax), P[index]])
return predicts_dict
implement non-maximum supression on predict bounding boxes.
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
threshhold: iou threshold
predicts_dict processed by non-maximum suppression
bbox_array = np.array(bbox, dtype=np.float)
x1, y1, x2, y2, scores = bbox_array[:,0], bbox_array[:,1],
bbox_array[:,2], bbox_array[:,3], bbox_array[:,4]
areas = (x2-x1+1) * (y2-y1+1)
order = scores.argsort()[::-1]
keep = []
i = order[0]
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
inter = np.maximum(0.0, xx2-xx1+1) * np.maximum(0.0, yy2-yy1+1)
iou = inter/(areas[i]+areas[order[1:]]-inter)
indexs = np.where(iou<=threshold)[0]
order = order[indexs+1]
bbox = bbox_array[keep]
predicts_dict[object_name] = bbox.tolist()
predicts_dict = predicts_dict
return predicts_dict
class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle",
"bus", "car", "cat", "chair", "cow", "diningtable",
"dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor",
modelFile = ('models\\train\\model.ckpt-0')
track_object = "small_ball"print("object detection and tracking...")
multiObjectDetect = MultiObjectDetection(IP, class_names)
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
object_predicts = multiObjectDetect.predict_object(image)
sess = tf.Session()
saver = tf.train.Saver(multiObjectDetect._net.trainable_collection)
saver.restore(sess, modelFile)
src_img = cv2.imread("./weirdobject.jpg")
resized_img = cv2.resize(src_img, (448, 448))
np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
np_img = np_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1
np_img = np.reshape(np_img, (1, 448, 448, 3))
np_predict = sess.run(object_predicts, feed_dict={image: np_img})
predicts_dict = multiObjectDetect.process_predicts(resized_img, np_predict)
predicts_dict = multiObjectDetect.non_max_suppress(predicts_dict)
print ("predict dict = ", predicts_dict)
predict dict = {'sheep': [[233.0, 92.0, 448.0, -103.0,
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]], 'horse': [[374.0, 33.0,
282.0, 448.0, 5.277851581573486], [135.0, 688.0, -33.0, -14.0,
3.5144259929656982], [1.0, 117.0, 112.0, -138.0, 2.656987190246582]],
'bicycle': [[461.0, 781.0, 154.0, -381.0, 5.918102741241455], [70.0,
344.0, 391.0, -138.0, 3.031444787979126], [378.0, 497.0, 46.0, 149.0,
2.7629122734069824], [541.0, 583.0, 69.0, 307.0, 2.7170517444610596],
[323.0, 22.0, 336.0, 448.0, 1.608760952949524]], 'bottle': [[390.0,
218.0, -199.0, 448.0, 4.582971096038818], [0.0, 0.0, 448.0, -410.0,
0.9097045063972473]], 'sofa': [[346.0, 102.0, 323.0, -38.0,
2.371835947036743]], 'dog': [[319.0, 254.0, -282.0, 373.0,
4.022889137268066]], 'cat': [[63.0, -195.0, 365.0, -92.0,
3.5134828090667725]], 'person': [[22.0, -122.0, 154.0, 448.0,
3.927537441253662], [350.0, 155.0, -36.0, -445.0, 2.679833173751831],
[119.0, 416.0, -43.0, 292.0, 0.9529445171356201], [251.0, 445.0,
225.0, 188.0, 0.9001350402832031]], 'train': [[329.0, 485.0, -24.0,
-235.0, 2.7050414085388184], [483.0, 362.0, 237.0, -86.0,
2.555817127227783], [13.0, 365.0, 373.0, 448.0, 0.6229299902915955]],
'small_ball': [[217.0, 737.0, 448.0, -315.0, 1.739920973777771],
[117.0, 283.0, 153.0, 122.0, 1.5690066814422607]], 'boat': [[164.0,
805.0, 34.0, -169.0, 4.972668170928955], [0.0, 0.0, 397.0, 69.0,
2.353729486465454], [302.0, 605.0, 15.0, -22.0, 2.0259625911712646]],
'aeroplane': [[470.0, 616.0, -305.0, -37.0, 3.431873321533203], [0.0,
0.0, 448.0, -72.0, 2.836672306060791]], 'bus': [[0.0, 0.0, -101.0,
-280.0, 1.2078320980072021]], 'pottedplant': [[620.0, -268.0, -124.0,
418.0, 2.158564805984497], [0.0, 0.0, 448.0, -779.0,
1.6623022556304932]], 'tvmonitor': [[0.0, 0.0, 448.0, 85.0,
3.238999128341675], [240.0, 772.0, 200.0, 91.0, 1.7443398237228394],
[546.0, 155.0, 448.0, 448.0, 1.1334525346755981], [107.0, 441.0,
432.0, 219.0, 0.5971617698669434]], 'chair': [[470.0, -187.0, 106.0,
235.0, 3.8548083305358887], [524.0, 740.0, -103.0, 99.0,
3.636549234390259], [0.0, 0.0, 275.0, -325.0, 3.0997846126556396],
[[138.0, -310.0, 111.0, 448.0, 4.660728931427002], [317.0, -66.0,
313.0, 6.0, 4.535496234893799], [0.0, 0.0, -41.0, 175.0,
1.8571208715438843], [21.0, -92.0, 76.0, 172.0, 1.2035608291625977],
[0.0, 0.0, 448.0, -250.0, 1.00322687625885]], 'car': [[312.0, 232.0,
132.0, 309.0, 3.205225706100464], [514.0, -76.0, 218.0, 448.0,
1.4289973974227905], [0.0, 0.0, 448.0, 142.0, 0.7124998569488525]]}
WHile I expect only the dict to contain the small_ball key
How's that is possible? where's the prediction output?How to fix the code?
Without trying to figure out all that code, why would you expect only
results for a single key? An ML system is going to compute
probabilities and parameters for all objects it knows about (presumably
subject to some threshold).
marc nicole
2024-07-30 20:49:21 UTC
OK, but how's the probability of small_ball greater than others? I can't
find it anyway, what's its value?

Post by marc nicole
Post by marc nicole
Hello all,
I want to predict an object by given as input an image and want to have
Post by marc nicole
model be able to predict the label. I have trained a model using
Post by marc nicole
based on annotated database where the target object to predict was added
Post by marc nicole
the pretrained model. the code I am using is the following where I set
Post by marc nicole
self._classes_name = classes_name
self._num_classes = len(classes_name)
self._net_params = {'cell_size': 7, 'boxes_per_cell':2,
'weight_decay': 0.0005}
self._net = YoloTinyNet(self._common_params, self._net_params,
predicts = self._net.inference(image)
return predicts
process the predicts of object detection with one image input.
resized_img: resized source image.
predicts: output of the model.
thresh: thresh of bounding box confidence.
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1],
Post by marc nicole
cls_num = self._num_classes
bbx_per_cell = self._net_params["boxes_per_cell"]
cell_size = self._net_params["cell_size"]
img_size = self._common_params["image_size"]
p_classes = predicts[0, :, :, 0:cls_num]
C = predicts[0, :, :, cls_num:cls_num+bbx_per_cell] # two
bounding boxes in one cell.
coordinate = predicts[0, :, :, cls_num+bbx_per_cell:] # all
bounding boxes position.
p_classes = np.reshape(p_classes, (cell_size, cell_size, 1,
Post by marc nicole
C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))
P = C * p_classes # confidencefor all classes of all bounding
boxes (cell_size, cell_size, bounding_box_num, class_num) = (7, 7, 2,
predicts_dict = {}
temp_data = np.zeros_like(P, np.float32)
temp_data[i, j, :, :] = P[i, j, :, :]
position = np.argmax(temp_data) # refer to the class
num (with maximum confidence) for every bounding box.
index = np.unravel_index(position, P.shape)
class_num = index[-1]
coordinate = np.reshape(coordinate, (cell_size,
cell_size, bbx_per_cell, 4)) # (cell_size, cell_size,
bbox_num_per_cell, coordinate)[xmin, ymin, xmax, ymax]
max_coordinate = coordinate[index[0], index[1],
index[2], :]
Post by marc nicole
xcenter = max_coordinate[0]
ycenter = max_coordinate[1]
w = max_coordinate[2]
h = max_coordinate[3]
xcenter = (index[1] + xcenter) * (1.0*img_size
Post by marc nicole
ycenter = (index[0] + ycenter) * (1.0*img_size
Post by marc nicole
w = w * img_size
h = h * img_size
xmin = 0 if (xcenter - w/2.0 < 0) else (xcenter -
Post by marc nicole
ymin = 0 if (xcenter - w/2.0 < 0) else (ycenter -
Post by marc nicole
xmax = resized_img.shape[0] if (xmin + w) >
resized_img.shape[0] else (xmin + w)
ymax = resized_img.shape[1] if (ymin + h) >
resized_img.shape[1] else (ymin + h)
class_name = self._classes_name[class_num]
predicts_dict.setdefault(class_name, [])
int(ymin), int(xmax), int(ymax), P[index]])
return predicts_dict
implement non-maximum supression on predict bounding boxes.
predicts_dict: {"stick": [[x1, y1, x2, y2, scores1],
Post by marc nicole
threshhold: iou threshold
predicts_dict processed by non-maximum suppression
bbox_array = np.array(bbox, dtype=np.float)
x1, y1, x2, y2, scores = bbox_array[:,0], bbox_array[:,1],
bbox_array[:,2], bbox_array[:,3], bbox_array[:,4]
areas = (x2-x1+1) * (y2-y1+1)
order = scores.argsort()[::-1]
keep = []
i = order[0]
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
inter = np.maximum(0.0, xx2-xx1+1) * np.maximum(0.0,
Post by marc nicole
iou = inter/(areas[i]+areas[order[1:]]-inter)
indexs = np.where(iou<=threshold)[0]
order = order[indexs+1]
bbox = bbox_array[keep]
predicts_dict[object_name] = bbox.tolist()
predicts_dict = predicts_dict
return predicts_dict
class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle",
"bus", "car", "cat", "chair", "cow", "diningtable",
"dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor",
modelFile = ('models\\train\\model.ckpt-0')
track_object = "small_ball"print("object detection and tracking...")
multiObjectDetect = MultiObjectDetection(IP, class_names)
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
object_predicts = multiObjectDetect.predict_object(image)
sess = tf.Session()
saver = tf.train.Saver(multiObjectDetect._net.trainable_collection)
saver.restore(sess, modelFile)
src_img = cv2.imread("./weirdobject.jpg")
resized_img = cv2.resize(src_img, (448, 448))
np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
np_img = np_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1
np_img = np.reshape(np_img, (1, 448, 448, 3))
np_predict = sess.run(object_predicts, feed_dict={image: np_img})
predicts_dict = multiObjectDetect.process_predicts(resized_img,
Post by marc nicole
predicts_dict = multiObjectDetect.non_max_suppress(predicts_dict)
print ("predict dict = ", predicts_dict)
predict dict = {'sheep': [[233.0, 92.0, 448.0, -103.0,
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]], 'horse': [[374.0, 33.0,
282.0, 448.0, 5.277851581573486], [135.0, 688.0, -33.0, -14.0,
3.5144259929656982], [1.0, 117.0, 112.0, -138.0, 2.656987190246582]],
'bicycle': [[461.0, 781.0, 154.0, -381.0, 5.918102741241455], [70.0,
344.0, 391.0, -138.0, 3.031444787979126], [378.0, 497.0, 46.0, 149.0,
2.7629122734069824], [541.0, 583.0, 69.0, 307.0, 2.7170517444610596],
[323.0, 22.0, 336.0, 448.0, 1.608760952949524]], 'bottle': [[390.0,
218.0, -199.0, 448.0, 4.582971096038818], [0.0, 0.0, 448.0, -410.0,
0.9097045063972473]], 'sofa': [[346.0, 102.0, 323.0, -38.0,
2.371835947036743]], 'dog': [[319.0, 254.0, -282.0, 373.0,
4.022889137268066]], 'cat': [[63.0, -195.0, 365.0, -92.0,
3.5134828090667725]], 'person': [[22.0, -122.0, 154.0, 448.0,
3.927537441253662], [350.0, 155.0, -36.0, -445.0, 2.679833173751831],
[119.0, 416.0, -43.0, 292.0, 0.9529445171356201], [251.0, 445.0,
225.0, 188.0, 0.9001350402832031]], 'train': [[329.0, 485.0, -24.0,
-235.0, 2.7050414085388184], [483.0, 362.0, 237.0, -86.0,
2.555817127227783], [13.0, 365.0, 373.0, 448.0, 0.6229299902915955]],
'small_ball': [[217.0, 737.0, 448.0, -315.0, 1.739920973777771],
[117.0, 283.0, 153.0, 122.0, 1.5690066814422607]], 'boat': [[164.0,
805.0, 34.0, -169.0, 4.972668170928955], [0.0, 0.0, 397.0, 69.0,
2.353729486465454], [302.0, 605.0, 15.0, -22.0, 2.0259625911712646]],
'aeroplane': [[470.0, 616.0, -305.0, -37.0, 3.431873321533203], [0.0,
0.0, 448.0, -72.0, 2.836672306060791]], 'bus': [[0.0, 0.0, -101.0,
-280.0, 1.2078320980072021]], 'pottedplant': [[620.0, -268.0, -124.0,
418.0, 2.158564805984497], [0.0, 0.0, 448.0, -779.0,
1.6623022556304932]], 'tvmonitor': [[0.0, 0.0, 448.0, 85.0,
3.238999128341675], [240.0, 772.0, 200.0, 91.0, 1.7443398237228394],
[546.0, 155.0, 448.0, 448.0, 1.1334525346755981], [107.0, 441.0,
432.0, 219.0, 0.5971617698669434]], 'chair': [[470.0, -187.0, 106.0,
235.0, 3.8548083305358887], [524.0, 740.0, -103.0, 99.0,
3.636549234390259], [0.0, 0.0, 275.0, -325.0, 3.0997846126556396],
[[138.0, -310.0, 111.0, 448.0, 4.660728931427002], [317.0, -66.0,
313.0, 6.0, 4.535496234893799], [0.0, 0.0, -41.0, 175.0,
1.8571208715438843], [21.0, -92.0, 76.0, 172.0, 1.2035608291625977],
[0.0, 0.0, 448.0, -250.0, 1.00322687625885]], 'car': [[312.0, 232.0,
132.0, 309.0, 3.205225706100464], [514.0, -76.0, 218.0, 448.0,
1.4289973974227905], [0.0, 0.0, 448.0, 142.0, 0.7124998569488525]]}
WHile I expect only the dict to contain the small_ball key
How's that is possible? where's the prediction output?How to fix the
Without trying to figure out all that code, why would you expect only
results for a single key? An ML system is going to compute
probabilities and parameters for all objects it knows about (presumably
subject to some threshold).
Thomas Passin
2024-07-30 21:45:20 UTC
OK, but how's the probability of small_ball greater than others? I can't
find it anyway, what's its value?
It's your code. I wouldn't know. I suppose it's represented somewhere in
all those parameters. You need to understand what those function calls
are returning. It's documented somewhere, right?

And you really do need to know the probabilities of the competing images
because otherwise you won't know how confident you can be that the
identification is a strong one.
Hello all,
I want to predict an object by given as input an image and want
to have my
Post by marc nicole
model be able to predict the label. I have trained a model using
Post by marc nicole
based on annotated database where the target object to predict
was added to
Post by marc nicole
the pretrained model. the code I am using is the following where
I set the
Post by marc nicole
          self._classes_name = classes_name
          self._num_classes = len(classes_name)
          self._net_params = {'cell_size': 7, 'boxes_per_cell':2,
'weight_decay': 0.0005}
          self._net = YoloTinyNet(self._common_params,
Post by marc nicole
          predicts = self._net.inference(image)
          return predicts
          process the predicts of object detection with one image
Post by marc nicole
              resized_img: resized source image.
              predicts: output of the model.
              thresh: thresh of bounding box confidence.
              predicts_dict: {"stick": [[x1, y1, x2, y2, scores1],
Post by marc nicole
          cls_num = self._num_classes
          bbx_per_cell = self._net_params["boxes_per_cell"]
          cell_size = self._net_params["cell_size"]
          img_size = self._common_params["image_size"]
          p_classes = predicts[0, :, :, 0:cls_num]
          C = predicts[0, :, :, cls_num:cls_num+bbx_per_cell] # two
bounding boxes in one cell.
          coordinate = predicts[0, :, :, cls_num+bbx_per_cell:] # all
bounding boxes position.
          p_classes = np.reshape(p_classes, (cell_size, cell_size,
1, cls_num))
Post by marc nicole
          C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))
          P = C * p_classes # confidencefor all classes of all
Post by marc nicole
boxes (cell_size, cell_size, bounding_box_num, class_num) = (7, 7, 2,
          predicts_dict = {}
                  temp_data = np.zeros_like(P, np.float32)
                  temp_data[i, j, :, :] = P[i, j, :, :]
                  position = np.argmax(temp_data) # refer to the class
num (with maximum confidence) for every bounding box.
                  index = np.unravel_index(position, P.shape)
                      class_num = index[-1]
                      coordinate = np.reshape(coordinate, (cell_size,
cell_size, bbx_per_cell, 4)) # (cell_size, cell_size,
bbox_num_per_cell, coordinate)[xmin, ymin, xmax, ymax]
                      max_coordinate = coordinate[index[0],
index[1], index[2], :]
Post by marc nicole
                      xcenter = max_coordinate[0]
                      ycenter = max_coordinate[1]
                      w = max_coordinate[2]
                      h = max_coordinate[3]
                      xcenter = (index[1] + xcenter) *
(1.0*img_size /cell_size)
Post by marc nicole
                      ycenter = (index[0] + ycenter) *
(1.0*img_size /cell_size)
Post by marc nicole
                      w = w * img_size
                      h = h * img_size
                      xmin = 0 if (xcenter - w/2.0 < 0) else
(xcenter - w/2.0)
Post by marc nicole
                      ymin = 0 if (xcenter - w/2.0 < 0) else
(ycenter - h/2.0)
Post by marc nicole
                      xmax = resized_img.shape[0] if (xmin + w) >
resized_img.shape[0] else (xmin + w)
                      ymax = resized_img.shape[1] if (ymin + h) >
resized_img.shape[1] else (ymin + h)
                      class_name = self._classes_name[class_num]
                      predicts_dict.setdefault(class_name, [])
int(ymin), int(xmax), int(ymax), P[index]])
          return predicts_dict
          implement non-maximum supression on predict bounding boxes.
              predicts_dict: {"stick": [[x1, y1, x2, y2, scores1],
Post by marc nicole
              threshhold: iou threshold
              predicts_dict processed by non-maximum suppression
              bbox_array = np.array(bbox, dtype=np.float)
              x1, y1, x2, y2, scores = bbox_array[:,0],
Post by marc nicole
bbox_array[:,2], bbox_array[:,3], bbox_array[:,4]
              areas = (x2-x1+1) * (y2-y1+1)
              order = scores.argsort()[::-1]
              keep = []
                  i = order[0]
                  xx1 = np.maximum(x1[i], x1[order[1:]])
                  yy1 = np.maximum(y1[i], y1[order[1:]])
                  xx2 = np.minimum(x2[i], x2[order[1:]])
                  yy2 = np.minimum(y2[i], y2[order[1:]])
                  inter = np.maximum(0.0, xx2-xx1+1) *
np.maximum(0.0, yy2-yy1+1)
Post by marc nicole
                  iou = inter/(areas[i]+areas[order[1:]]-inter)
                  indexs = np.where(iou<=threshold)[0]
                  order = order[indexs+1]
              bbox = bbox_array[keep]
              predicts_dict[object_name] = bbox.tolist()
              predicts_dict = predicts_dict
          return predicts_dict
class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle",
"bus", "car", "cat", "chair", "cow", "diningtable",
                     "dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor",
modelFile = ('models\\train\\model.ckpt-0')
track_object = "small_ball"print("object detection and tracking...")
multiObjectDetect = MultiObjectDetection(IP, class_names)
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
object_predicts = multiObjectDetect.predict_object(image)
sess = tf.Session()
saver = tf.train.Saver(multiObjectDetect._net.trainable_collection)
saver.restore(sess, modelFile)
      src_img = cv2.imread("./weirdobject.jpg")
      resized_img = cv2.resize(src_img, (448, 448))
      np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
      np_img = np_img.astype(np.float32)
      np_img = np_img / 255.0 * 2 - 1
      np_img = np.reshape(np_img, (1, 448, 448, 3))
Post by marc nicole
      predicts_dict =
multiObjectDetect.process_predicts(resized_img, np_predict)
Post by marc nicole
      predicts_dict =
Post by marc nicole
      print ("predict dict = ", predicts_dict)
predict dict =  {'sheep': [[233.0, 92.0, 448.0, -103.0,
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]], 'horse': [[374.0, 33.0,
282.0, 448.0, 5.277851581573486], [135.0, 688.0, -33.0, -14.0,
3.5144259929656982], [1.0, 117.0, 112.0, -138.0, 2.656987190246582]],
'bicycle': [[461.0, 781.0, 154.0, -381.0, 5.918102741241455], [70.0,
344.0, 391.0, -138.0, 3.031444787979126], [378.0, 497.0, 46.0, 149.0,
2.7629122734069824], [541.0, 583.0, 69.0, 307.0, 2.7170517444610596],
[323.0, 22.0, 336.0, 448.0, 1.608760952949524]], 'bottle': [[390.0,
218.0, -199.0, 448.0, 4.582971096038818], [0.0, 0.0, 448.0, -410.0,
0.9097045063972473]], 'sofa': [[346.0, 102.0, 323.0, -38.0,
2.371835947036743]], 'dog': [[319.0, 254.0, -282.0, 373.0,
4.022889137268066]], 'cat': [[63.0, -195.0, 365.0, -92.0,
3.5134828090667725]], 'person': [[22.0, -122.0, 154.0, 448.0,
3.927537441253662], [350.0, 155.0, -36.0, -445.0, 2.679833173751831],
[119.0, 416.0, -43.0, 292.0, 0.9529445171356201], [251.0, 445.0,
225.0, 188.0, 0.9001350402832031]], 'train': [[329.0, 485.0, -24.0,
-235.0, 2.7050414085388184], [483.0, 362.0, 237.0, -86.0,
2.555817127227783], [13.0, 365.0, 373.0, 448.0, 0.6229299902915955]],
'small_ball': [[217.0, 737.0, 448.0, -315.0, 1.739920973777771],
[117.0, 283.0, 153.0, 122.0, 1.5690066814422607]], 'boat': [[164.0,
805.0, 34.0, -169.0, 4.972668170928955], [0.0, 0.0, 397.0, 69.0,
2.353729486465454], [302.0, 605.0, 15.0, -22.0, 2.0259625911712646]],
'aeroplane': [[470.0, 616.0, -305.0, -37.0, 3.431873321533203], [0.0,
0.0, 448.0, -72.0, 2.836672306060791]], 'bus': [[0.0, 0.0, -101.0,
-280.0, 1.2078320980072021]], 'pottedplant': [[620.0, -268.0, -124.0,
418.0, 2.158564805984497], [0.0, 0.0, 448.0, -779.0,
1.6623022556304932]], 'tvmonitor': [[0.0, 0.0, 448.0, 85.0,
3.238999128341675], [240.0, 772.0, 200.0, 91.0, 1.7443398237228394],
[546.0, 155.0, 448.0, 448.0, 1.1334525346755981], [107.0, 441.0,
432.0, 219.0, 0.5971617698669434]], 'chair': [[470.0, -187.0, 106.0,
235.0, 3.8548083305358887], [524.0, 740.0, -103.0, 99.0,
3.636549234390259], [0.0, 0.0, 275.0, -325.0, 3.0997846126556396],
[[138.0, -310.0, 111.0, 448.0, 4.660728931427002], [317.0, -66.0,
313.0, 6.0, 4.535496234893799], [0.0, 0.0, -41.0, 175.0,
1.8571208715438843], [21.0, -92.0, 76.0, 172.0, 1.2035608291625977],
[0.0, 0.0, 448.0, -250.0, 1.00322687625885]], 'car': [[312.0, 232.0,
132.0, 309.0, 3.205225706100464], [514.0, -76.0, 218.0, 448.0,
1.4289973974227905], [0.0, 0.0, 448.0, 142.0, 0.7124998569488525]]}
WHile I expect only the dict to contain the small_ball key
How's that is possible? where's the prediction output?How to fix
the code?
Without trying to figure out all that code, why would you expect only
results for a single key?  An ML system is going to compute
probabilities and parameters for all objects it knows about (presumably
subject to some threshold).
2024-07-30 22:16:29 UTC
Post by marc nicole
Hello all,
I want to predict an object by given as input an image and want to have my
model be able to predict the label. I have trained a model using tensorflow
based on annotated database where the target object to predict was added to
the pretrained model. the code I am using is the following where I set the
Post by marc nicole
WHile I expect only the dict to contain the small_ball key
How's that is possible? where's the prediction output?How to fix the code?
To save us lots of reading and study to be able to help you, please advise:

1 what are the meanings of all these numbers?
Post by marc nicole
'sheep': [[233.0, 92.0, 448.0, -103.0,
Post by marc nicole
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]]
2 (assuming it hasn't) why the dict has not been sorted into a
meaningful order

3 how can one tell that the image is more likely to be a sheep than a train?
marc nicole
2024-07-31 10:27:07 UTC
I suppose the meaning of those numbers comes from this line
predicts_dict[class_name].append([int(xmin), int(ymin), int(xmax), int(ymax),
P[index]]) as well as the yolo inference call. But i was expecting zeros
for all classes except smallball. Because the image only shows that, and
that a train and a sheep wont have any target position or any probability
whatsoever in the image weirdobject.jpg
Post by marc nicole
Post by marc nicole
Hello all,
I want to predict an object by given as input an image and want to have
Post by marc nicole
model be able to predict the label. I have trained a model using
Post by marc nicole
based on annotated database where the target object to predict was added
Post by marc nicole
the pretrained model. the code I am using is the following where I set
Post by marc nicole
WHile I expect only the dict to contain the small_ball key
How's that is possible? where's the prediction output?How to fix the
1 what are the meanings of all these numbers?
Post by marc nicole
'sheep': [[233.0, 92.0, 448.0, -103.0,
Post by marc nicole
5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
[0.0, 0.0, 448.0, 431.0, 3.393721580505371]]
2 (assuming it hasn't) why the dict has not been sorted into a
meaningful order
3 how can one tell that the image is more likely to be a sheep than a
Grant Edwards
2024-07-31 13:58:12 UTC
Post by marc nicole
I suppose the meaning of those numbers comes from this line
predicts_dict[class_name].append([int(xmin), int(ymin), int(xmax),
int(ymax), P[index]]) as well as the yolo inference call. But i was
expecting zeros for all classes except smallball.
That's not how machine learning and object recognition works.
Post by marc nicole
Because the image only shows that,
You know that. The machine doesn't.
Post by marc nicole
and that a train and a sheep wont have any target position or any
probability whatsoever in the image weirdobject.jpg
That depends on the training data and how the model works.

You should probably do some reading on neural networks, machine
learning, and pattern/object recognition. You appear to be trying to
use tools without understanding what they do or how they work.

marc nicole
2024-07-31 19:59:58 UTC
You invitation to read on machine is not helping, if you wanna enlighten us
on this specific case otherwise pls spare me such comments which i know

Post by Grant Edwards
Post by marc nicole
I suppose the meaning of those numbers comes from this line
predicts_dict[class_name].append([int(xmin), int(ymin), int(xmax),
int(ymax), P[index]]) as well as the yolo inference call. But i was
expecting zeros for all classes except smallball.
That's not how machine learning and object recognition works.
Post by marc nicole
Because the image only shows that,
You know that. The machine doesn't.
Post by marc nicole
and that a train and a sheep wont have any target position or any
probability whatsoever in the image weirdobject.jpg
That depends on the training data and how the model works.
You should probably do some reading on neural networks, machine
learning, and pattern/object recognition. You appear to be trying to
use tools without understanding what they do or how they work.