import cv2
import os
import sys
import time
import numpy as np
import axengine as ort
import argparse


def make_grid(nx=20, ny=20, i=0, strides=(8, 16, 32),
              anchors=((31, 28, 38, 32, 60, 83), (84, 110, 133, 118, 200, 113))):
    """Build the cell-offset grid and the anchor grid for detection level ``i``.

    Args:
        nx: Number of grid cells along x (feature-map width).
        ny: Number of grid cells along y (feature-map height).
        i: Index of the detection level whose anchors are used.
        strides: Unused here; kept so existing callers keep working.
        anchors: Per-level flat sequences ``(w0, h0, w1, h1, ...)``.

    Returns:
        ``(grid, anchor_grid)``, both shaped ``(1, na, ny, nx, 2)`` where
        ``na = len(anchors[i]) // 2``. ``grid[..., 0]`` is the x cell index and
        ``grid[..., 1]`` the y cell index, each shifted by -0.5.
        ``anchor_grid`` repeats the anchor pairs of level ``i`` over every cell.
    """
    # Use the anchor count of the level actually being read, not of level 0.
    num_anchor = len(anchors[i]) // 2
    y, x = np.arange(ny, dtype=np.int32), np.arange(nx, dtype=np.int32)
    yv, xv = np.meshgrid(y, x, indexing="ij")
    grid = np.stack((xv, yv), 2)
    grid = np.expand_dims(grid, axis=0).repeat(num_anchor, axis=0)
    # Grid offset used by the decode step: xy = (2 * sigmoid - 0.5 + cell) * stride.
    grid = np.expand_dims(grid, axis=0) - 0.5
    # Anchors are used as given; they are NOT multiplied by the stride.
    anchor_grid = np.array(anchors[i]).reshape((1, num_anchor, 1, 1, 2))
    anchor_grid = anchor_grid.repeat(ny, axis=2).repeat(nx, axis=3)
    return grid, anchor_grid


def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    return 1 / (1 + np.exp(-x))


def xywh2xyxy(x):
    """Convert boxes from [cx, cy, w, h] to [x1, y1, x2, y2].

    Only the first four entries of the last axis are rewritten; any further
    columns are copied through unchanged. The input is not modified.
    """
    y = np.copy(x)
    y[..., 0] = x[..., 0] - x[..., 2] / 2  # top left x
    y[..., 1] = x[..., 1] - x[..., 3] / 2  # top left y
    y[..., 2] = x[..., 0] + x[..., 2] / 2  # bottom right x
    y[..., 3] = x[..., 1] + x[..., 3] / 2  # bottom right y
    return y


def letterbox(im, new_shape=(640, 640), color=(0, 0, 0), auto=False, scaleFill=False, scaleup=True, stride=32):
    """Resize ``im`` keeping aspect ratio and pad it to ``new_shape``.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target (height, width), or a single int for a square.
        color: Border colour used for the padding.
        auto: If true, pad only up to the next multiple of ``stride``.
        scaleFill: If true (and ``auto`` is false), stretch without padding.
        scaleup: If false, never enlarge the image, only shrink it.
        stride: Stride used by the ``auto`` mode.

    Returns:
        ``(image, ratio, (dw, dh))`` where ``ratio`` is the (width, height)
        scale factor and ``dw``/``dh`` are the per-side paddings (may be
        fractional; the actual borders are rounded).
    """
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up
        r = min(r, 1.0)

    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    # The +/-0.1 makes an odd total padding split as (floor, ceil).
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border for 2 sides
    return im, ratio, (dw, dh)


def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """Map xyxy boxes from the letterboxed image back to the original image.

    Args:
        img1_shape: (height, width) of the letterboxed image.
        boxes: Array whose last axis starts with [x1, y1, x2, y2]. It is
            modified IN PLACE and also returned.
        img0_shape: Shape of the original image; only [0] and [1] are read.
        ratio_pad: Optional ``((gain, ...), (pad_x, pad_y))``; when omitted the
            gain and padding are recomputed from the two shapes.

    Returns:
        The same ``boxes`` array, rescaled. No clipping is applied.
    """
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    boxes[..., [0, 2]] -= pad[0]  # x padding
    boxes[..., [1, 3]] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    return boxes


def nms(boxes, iou_thresh=0.65):
    """Greedy single-class non-maximum suppression.

    Args:
        boxes: Array of shape (N, >=5) with columns [x1, y1, x2, y2, score, ...].
        iou_thresh: Boxes overlapping a kept box with IoU above this are dropped.

    Returns:
        The kept rows of ``boxes``, ordered by descending score.
    """
    xmin, ymin, xmax, ymax = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    score = boxes[:, 4]
    # Areas and intersections use the inclusive-pixel (+1) convention.
    areas = (xmax - xmin + 1) * (ymax - ymin + 1)
    order = score.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xxmin = np.maximum(xmin[i], xmin[order[1:]])
        yymin = np.maximum(ymin[i], ymin[order[1:]])
        xxmax = np.minimum(xmax[i], xmax[order[1:]])
        yymax = np.minimum(ymax[i], ymax[order[1:]])
        w = np.maximum(0, xxmax - xxmin + 1)
        h = np.maximum(0, yymax - yymin + 1)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # iou was computed against order[1:], so shift the indices by one.
        order = order[np.where(iou <= iou_thresh)[0] + 1]
    return boxes[keep, :]


def nms_multi(boxes, conf_thresh=0.25, iou_thresh=0.65, max_num=300):
    """Per-class NMS over raw predictions.

    Args:
        boxes: Array of shape (N, 5 + num_classes) with columns
            [x1, y1, x2, y2, objectness, class scores...].
        conf_thresh: Rows whose objectness is not above this are discarded.
        iou_thresh: IoU threshold forwarded to :func:`nms`.
        max_num: Only the ``max_num`` highest-scoring candidates enter NMS.

    Returns:
        Array of shape (M, 6) with columns
        [x1, y1, x2, y2, objectness * best class score, class index].
        An empty ``(0, 6)`` array is returned when nothing survives, so callers
        can always slice the result like an ndarray.
    """
    boxes = np.asarray(boxes)
    out_dtype = boxes.dtype if boxes.dtype.kind == "f" else np.float64
    empty = np.zeros((0, 6), dtype=out_dtype)
    if boxes.size == 0:
        return empty

    boxes = np.atleast_2d(boxes)
    boxes = boxes[boxes[:, 4] > conf_thresh]
    if len(boxes) == 0:
        return empty

    cls_score = boxes[:, 5:]
    max_cls_index = np.argmax(cls_score, axis=-1)
    max_cls_score = np.max(cls_score, axis=-1)

    dets = np.concatenate([
        boxes[:, :4],                   # 0:4 coordinates
        boxes[:, 4:5],                  # 4 objectness
        max_cls_score[:, np.newaxis],   # 5 best class score
        max_cls_index[:, np.newaxis],   # 6 class index
    ], axis=-1)
    dets[:, 4] = dets[:, 4] * dets[:, 5]
    top = dets[:, 4].argsort()[::-1][:max_num]
    dets = dets[top, :]
    dets = dets[:, [0, 1, 2, 3, 4, 6]]

    result = []
    for c in np.unique(dets[:, -1]):
        nmsed_det = nms(dets[dets[:, -1] == c], iou_thresh=iou_thresh)
        if len(nmsed_det):
            result.append(nmsed_det)
    if result:
        return np.concatenate(result, axis=0)
    return empty


def onnx_inference(opt):
    """Run the detector on one image, print the detections and optionally draw them.

    Reads from ``opt``: ``model``, ``img``, ``imgsz`` (h, w), ``anchors``,
    ``strides``, ``classes``, ``vis``, ``save_name`` and, when present,
    ``conf_thres``, ``iou_thres`` and ``max_det``.

    Raises:
        FileNotFoundError: If the input image cannot be read.
        ValueError: If the model has more outputs than configured
            strides/anchor levels.
    """
    session = ort.InferenceSession(opt.model)
    input_name = session.get_inputs()[0].name
    output_name = [output.name for output in session.get_outputs()]
    print(f"Input_name: {input_name}, Output_name: {output_name}")

    img = cv2.imread(f'{opt.img}')
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"cannot read image: {opt.img}")

    t1 = time.time()
    img_letter, _, _ = letterbox(img, opt.imgsz)  # h w c
    # Add batch axis, flip the channel order (BGR -> RGB) and go to NCHW.
    input_data = np.expand_dims(img_letter, axis=0)[..., ::-1].transpose((0, 3, 1, 2))
    # The flip/transpose above only produce a strided view; hand the runtime
    # a dense buffer.
    input_data = np.ascontiguousarray(input_data)
    t2 = time.time()
    print(f"Preprocess time: {(t2-t1)*1000:.2f} ms")

    t3 = time.time()
    outputs = session.run(output_name, {input_name: input_data})
    t4 = time.time()
    print(f"Inference time: {(t4-t3)*1000:.2f} ms")

    if len(outputs) > len(opt.strides) or len(outputs) > len(opt.anchors):
        raise ValueError(
            f"model has {len(outputs)} outputs but only {len(opt.strides)} strides "
            f"and {len(opt.anchors)} anchor levels were configured"
        )

    num_anchor = len(opt.anchors[0]) // 2
    channel = len(opt.classes) + 5
    predictions = []
    for i, output in enumerate(outputs):
        bs, _, ny, nx = output.shape
        # (bs, na * channel, ny, nx) -> (bs, na, ny, nx, channel)
        output = sigmoid(output.reshape(bs, num_anchor, channel, ny, nx).transpose(0, 1, 3, 4, 2))
        grid, anchor_grid = make_grid(nx, ny, i, opt.strides, opt.anchors)
        xy, wh, conf = output[..., :2], output[..., 2:4], output[..., 4:]
        xy = (xy * 2 + grid) * opt.strides[i]  # xy
        wh = (wh * 2) ** 2 * anchor_grid  # wh
        prediction = np.concatenate((xy, wh, conf), 4)
        prediction = prediction.reshape(bs, num_anchor * nx * ny, channel)
        prediction = xywh2xyxy(prediction)
        prediction[..., 0:4:2] = np.clip(prediction[..., 0:4:2], a_min=0, a_max=opt.imgsz[1])
        prediction[..., 1:4:2] = np.clip(prediction[..., 1:4:2], a_min=0, a_max=opt.imgsz[0])
        predictions.append(prediction)

    # Only one image is fed, so take batch element 0 explicitly instead of
    # squeeze(), which could also collapse a length-1 candidate axis.
    # Row format before NMS: [x1, y1, x2, y2, obj, class scores...]
    predictions = np.concatenate(predictions, axis=1)[0]
    # Row format after NMS: [x1, y1, x2, y2, obj * cls_score, label]
    # Forward the CLI thresholds; fall back to nms_multi's own defaults when
    # the namespace does not carry them.
    predictions = nms_multi(
        predictions,
        conf_thresh=getattr(opt, "conf_thres", 0.25),
        iou_thresh=getattr(opt, "iou_thres", 0.65),
        max_num=getattr(opt, "max_det", 300),
    )  # TODO multi label for one box

    if len(predictions) > 0:
        boxes = scale_boxes(img_letter.shape[:2], predictions[:, :4], img.shape)
        # Removing the letterbox padding can push boxes slightly outside the
        # original image; clamp them to its bounds.
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, img.shape[1])
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, img.shape[0])
        predictions[:, :4] = boxes.round()
    else:
        print("no target")

    print(f"Total detect {len(predictions)} objects")
    for i, (x1, y1, x2, y2, conf, label) in enumerate(predictions):
        print(f"{i}: {opt.classes[int(label)]}\t {conf:.3f} [{round(x1,1)}, {round(y1,1)}, {round(x2,1)}, {round(y2,1)}]")

    if opt.vis:
        box_xyxy = predictions[:, :4].astype(np.int32)
        scores = predictions[:, -2]
        labels = predictions[:, -1].astype(np.int32)
        for box, score, label in zip(box_xyxy, scores, labels):
            # Plain Python ints: some OpenCV builds reject numpy scalars in points.
            x1, y1, x2, y2 = (int(v) for v in box)
            caption = f"{opt.classes[int(label)]}:{score:.3f}"
            img = cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
            text_size = cv2.getTextSize(caption, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            # Move the caption inside the box when it would leave the image top.
            if (y1 - text_size[0][1]) < 0:
                textxy = (x1, y1 + text_size[0][1])
            else:
                textxy = (x1, y1)
            cv2.putText(img, caption, textxy, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
        cv2.imwrite(f'{opt.save_name}', img)


def _str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_opt():
    """Parse the command-line options of the script.

    List-valued options accept several values on the command line:
    ``--strides 8 16``, ``--classes helmet head``, and ``--anchors`` as one
    flat list that is split evenly across the stride levels. A single
    ``--imgsz`` value is used for both height and width.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="./AX650/ax_ax650_hel_algo_V1.0.0.axmodel", help="onnx model path")
    parser.add_argument("--img", type=str, default="./test.jpg", help="img_path")
    parser.add_argument("--anchors", type=float, nargs="+",
                        default=[[31, 28, 38, 32, 60, 83], [84, 110, 133, 118, 200, 113]],
                        help="anchor based anchors")
    parser.add_argument("--strides", type=float, nargs="+", default=[8, 16], help="model strides")
    parser.add_argument("--imgsz", "--img-size", nargs="+", type=int, default=[256, 192], help="inference size h,w")
    parser.add_argument("--classes", type=str, nargs="+", default=["helmet", "head", "e-bike", "bike"],
                        help="classes num")
    parser.add_argument("--conf-thres", type=float, default=0.25, help="confidence threshold")
    parser.add_argument("--iou-thres", type=float, default=0.45, help="NMS IoU threshold")
    parser.add_argument("--max-det", type=int, default=50, help="maximum detections per image")
    # Without a type, "--vis False" would be the truthy string "False".
    parser.add_argument("--vis", type=_str2bool, nargs="?", const=True, default=True,
                        help="visualize detect result")
    parser.add_argument("--save_name", type=str, default="./out.jpg", help="detect img save path")
    opt = parser.parse_args()

    # A single size means a square input.
    if len(opt.imgsz) == 1:
        opt.imgsz = opt.imgsz * 2
    elif len(opt.imgsz) != 2:
        parser.error("--imgsz expects one or two integers (h [w])")

    # Anchors given on the command line arrive flat; regroup them per level.
    if opt.anchors and not isinstance(opt.anchors[0], (list, tuple)):
        levels = len(opt.strides)
        per_level, remainder = divmod(len(opt.anchors), levels)
        if remainder or per_level == 0 or per_level % 2:
            parser.error(
                "--anchors must hold the same even number of values for each of "
                f"the {levels} stride levels"
            )
        opt.anchors = [opt.anchors[k * per_level:(k + 1) * per_level] for k in range(levels)]
    return opt


if __name__ == "__main__":
    opt = parse_opt()
    onnx_inference(opt)