教育咨询网站模板,学校网站建设有限公司,ps在线,如何用自己电脑做网站目录引言计算机视觉基础图像的数字化表示图像预处理卷积神经网络#xff08;CNN#xff09;基础卷积操作池化层激活函数构建完整的CNN模型目标检测基础边界框表示非极大值抑制#xff08;NMS#xff09;实战项目#xff1a;简单的目标检测器数据准备简化的YOLO风格检测器训…目录引言计算机视觉基础图像的数字化表示图像预处理卷积神经网络CNN基础卷积操作池化层激活函数构建完整的CNN模型目标检测基础边界框表示非极大值抑制NMS实战项目简单的目标检测器数据准备简化的YOLO风格检测器训练目标检测器可视化检测结果高级目标检测算法简介R-CNN系列SSDSingle Shot MultiBox Detector实际应用示例视频中的目标检测性能评估mAPMean Average Precision计算总结学习建议引言计算机视觉Computer Vision, CV是人工智能领域的一个重要分支致力于让计算机能够理解和解释视觉信息。从图像分类到目标检测从图像分割到场景理解计算机视觉技术已经广泛应用于自动驾驶、医疗诊断、安防监控等多个领域。本文将深入探讨卷积神经网络CNN的原理及其在目标检测中的应用。计算机视觉基础图像的数字化表示计算机中的图像由像素组成每个像素具有特定的颜色值。对于灰度图像每个像素只有一个值0-255表示亮度对于彩色图像通常使用RGB三个通道表示颜色。importnumpyasnpimportmatplotlib.pyplotasplt# 创建示例图像defcreate_sample_image():# 创建一个简单的彩色图像 (100x100像素)imagenp.zeros((100,100,3),dtypenp.uint8)# 红色方块image[20:40,20:40][255,0,0]# 绿色方块image[60:80,20:40][0,255,0]# 蓝色方块image[40:60,60:80][0,0,255]# 黄色方块image[60:80,60:80][255,255,0]returnimage# 创建并显示图像sample_imagecreate_sample_image()plt.figure(figsize(8,4))plt.subplot(1,2,1)plt.imshow(sample_image)plt.title(原始图像)plt.axis(off)# 显示RGB通道plt.subplot(1,2,2)plt.imshow(sample_image)plt.title(RGB彩色图像)plt.axis(off)plt.show()# 查看图像的基本属性print(f图像尺寸:{sample_image.shape})print(f图像数据类型:{sample_image.dtype})print(f像素值范围:{sample_image.min()}-{sample_image.max()})图像预处理classImagePreprocessor:def__init__(self):passdefresize(self,image,new_size):调整图像大小fromskimage.transformimportresizereturnresize(image,new_size,anti_aliasingTrue)defnormalize(self,image):归一化图像到[0,1]范围returnimage/255.0defcenter_crop(self,image,crop_size):中心裁剪h,wimage.shape[:2]start_h(h-crop_size[0])//2start_w(w-crop_size[1])//2returnimage[start_h:start_hcrop_size[0],start_w:start_wcrop_size[1]]defaugment(self,image):数据增强augmented[]# 原始图像augmented.append(image)# 水平翻转augmented.append(np.fliplr(image))# 随机旋转anglenp.random.uniform(-30,30)fromskimage.transformimportrotate rotatedrotate(image,angle,modereflect)augmented.append(rotated)# 
亮度调整brightness_factornp.random.uniform(0.8,1.2)brightenednp.clip(image*brightness_factor,0,1)augmented.append(brightened)returnaugmented# 使用示例preprocessorImagePreprocessor()# 创建示例图像imagenp.random.rand(256,256,3)# 随机彩色图像# 预处理流程resizedpreprocessor.resize(image,(224,224))normalizedpreprocessor.normalize(resized)croppedpreprocessor.center_crop(normalized,(200,200))augmentedpreprocessor.augment(cropped)print(f原始图像大小:{image.shape})print(f预处理后大小:{normalized.shape})print(f增强后的图像数量:{len(augmented)})卷积神经网络CNN基础卷积操作卷积是CNN的核心操作通过卷积核滤波器在图像上滑动来提取特征。importnumpyasnpclassConv2D:def__init__(self,in_channels,out_channels,kernel_size,stride1,padding0):self.in_channelsin_channels self.out_channelsout_channels self.kernel_sizekernel_sizeifisinstance(kernel_size,tuple)else(kernel_size,kernel_size)self.stridestride self.paddingpadding# 初始化权重和偏置scalenp.sqrt(2.0/(in_channels*kernel_size*kernel_size))self.weightsnp.random.randn(out_channels,in_channels,*self.kernel_size)*scale self.biasnp.zeros(out_channels)defforward(self,x):前向传播batch_size,in_channels,height,widthx.shape kh,kwself.kernel_size# 计算输出尺寸out_height(height2*self.padding-kh)//self.stride1out_width(width2*self.padding-kw)//self.stride1# 填充输入ifself.padding0:x_paddednp.pad(x,((0,0),(0,0),(self.padding,self.padding),(self.padding,self.padding)),modeconstant)else:x_paddedx# 初始化输出outputnp.zeros((batch_size,self.out_channels,out_height,out_width))# 执行卷积操作forbinrange(batch_size):forocinrange(self.out_channels):forohinrange(out_height):forowinrange(out_width):# 计算卷积窗口的起始和结束位置h_startoh*self.stride h_endh_startkh w_startow*self.stride w_endw_startkw# 提取窗口并计算卷积windowx_padded[b,:,h_start:h_end,w_start:w_end]output[b,oc,oh,ow]np.sum(window*self.weights[oc])self.bias[oc]returnoutputdefbackward(self,x,grad_output,learning_rate):反向传播batch_size,in_channels,height,widthx.shape kh,kwself.kernel_size _,_,out_height,out_widthgrad_output.shape# 初始化梯度grad_weightsnp.zeros_like(self.weights)grad_biasnp.zeros_like(self.bias)grad_inputnp.zeros_like(x)# 
计算梯度forbinrange(batch_size):forocinrange(self.out_channels):forohinrange(out_height):forowinrange(out_width):# 计算输入梯度h_startoh*self.stride h_endh_startkh w_startow*self.stride w_endw_startkwifh_endheightandw_endwidth:grad_input[b,:,h_start:h_end,w_start:w_end]\ grad_output[b,oc,oh,ow]*self.weights[oc]# 计算权重梯度grad_weights[oc]grad_output[b,oc,oh,ow]*x[b,:,h_start:h_end,w_start:w_end]grad_bias[oc]np.sum(grad_output[b,oc])# 更新参数self.weights-learning_rate*grad_weights self.bias-learning_rate*grad_biasreturngrad_input# 示例使用# 创建输入 (batch_size2, channels3, height32, width32)input_datanp.random.randn(2,3,32,32)# 创建卷积层convConv2D(in_channels3,out_channels16,kernel_size3,stride1,padding1)# 前向传播outputconv.forward(input_data)print(f输入形状:{input_data.shape})print(f输出形状:{output.shape})# 可视化卷积核plt.figure(figsize(12,8))foriinrange(min(16,16)):# 显示前16个卷积核plt.subplot(4,4,i1)# 取第一个输入通道的卷积核kernelconv.weights[i,0]plt.imshow(kernel,cmapgray)plt.title(fKernel{i1})plt.axis(off)plt.suptitle(卷积核可视化)plt.show()池化层池化层用于降低特征图的维度减少计算量并提取主要特征。classMaxPool2D:def__init__(self,kernel_size,strideNone):self.kernel_sizekernel_sizeifisinstance(kernel_size,tuple)else(kernel_size,kernel_size)self.stridestrideifstrideisnotNoneelsekernel_sizedefforward(self,x):前向传播batch_size,channels,height,widthx.shape kh,kwself.kernel_size# 计算输出尺寸out_height(height-kh)//self.stride1out_width(width-kw)//self.stride1outputnp.zeros((batch_size,channels,out_height,out_width))# 执行最大池化forbinrange(batch_size):forcinrange(channels):forohinrange(out_height):forowinrange(out_width):h_startoh*self.stride h_endh_startkh w_startow*self.stride w_endw_startkw output[b,c,oh,ow]np.max(x[b,c,h_start:h_end,w_start:w_end])returnoutputclassAvgPool2D:def__init__(self,kernel_size,strideNone):self.kernel_sizekernel_sizeifisinstance(kernel_size,tuple)else(kernel_size,kernel_size)self.stridestrideifstrideisnotNoneelsekernel_sizedefforward(self,x):前向传播batch_size,channels,height,widthx.shape kh,kwself.kernel_size# 
计算输出尺寸out_height(height-kh)//self.stride1out_width(width-kw)//self.stride1outputnp.zeros((batch_size,channels,out_height,out_width))# 执行平均池化forbinrange(batch_size):forcinrange(channels):forohinrange(out_height):forowinrange(out_width):h_startoh*self.stride h_endh_startkh w_startow*self.stride w_endw_startkw output[b,c,oh,ow]np.mean(x[b,c,h_start:h_end,w_start:w_end])returnoutput# 示例使用# 创建输入特征图feature_mapnp.random.randn(1,1,8,8)# 1个通道8x8的特征图# 创建池化层max_poolMaxPool2D(kernel_size2,stride2)avg_poolAvgPool2D(kernel_size2,stride2)# 应用池化max_pooledmax_pool.forward(feature_map)avg_pooledavg_pool.forward(feature_map)print(f原始特征图大小:{feature_map.shape})print(f最大池化后大小:{max_pooled.shape})print(f平均池化后大小:{avg_pooled.shape})# 可视化plt.figure(figsize(12,4))plt.subplot(1,3,1)plt.imshow(feature_map[0,0],cmapgray)plt.title(原始特征图)plt.colorbar()plt.subplot(1,3,2)plt.imshow(max_pooled[0,0],cmapgray)plt.title(最大池化)plt.colorbar()plt.subplot(1,3,3)plt.imshow(avg_pooled[0,0],cmapgray)plt.title(平均池化)plt.colorbar()plt.show()激活函数defrelu(x):ReLU激活函数returnnp.maximum(0,x)defrelu_derivative(x):ReLU导数return(x0).astype(float)defleaky_relu(x,alpha0.01):Leaky ReLU激活函数returnnp.where(x0,x,alpha*x)defleaky_relu_derivative(x,alpha0.01):Leaky ReLU导数returnnp.where(x0,1,alpha)# 可视化激活函数xnp.linspace(-5,5,100)plt.figure(figsize(12,4))plt.subplot(1,3,1)plt.plot(x,relu(x))plt.title(ReLU)plt.grid(True)plt.subplot(1,3,2)plt.plot(x,relu_derivative(x))plt.title(ReLU导数)plt.grid(True)plt.subplot(1,3,3)plt.plot(x,leaky_relu(x))plt.title(Leaky 
ReLU)plt.grid(True)plt.show()构建完整的CNN模型classCNN:def__init__(self):self.layers[]defadd_conv_layer(self,in_channels,out_channels,kernel_size,stride1,padding0):添加卷积层self.layers.append({type:conv,layer:Conv2D(in_channels,out_channels,kernel_size,stride,padding)})defadd_pool_layer(self,pool_typemax,kernel_size2,strideNone):添加池化层ifpool_typemax:self.layers.append({type:pool,layer:MaxPool2D(kernel_size,stride)})elifpool_typeavg:self.layers.append({type:pool,layer:AvgPool2D(kernel_size,stride)})defadd_activation(self,activation_typerelu):添加激活函数self.layers.append({type:activation,activation:activation_type})defforward(self,x):前向传播activations[x]forlayer_infoinself.layers:iflayer_info[type]conv:xlayer_info[layer].forward(x)eliflayer_info[type]pool:xlayer_info[layer].forward(x)eliflayer_info[type]activation:iflayer_info[activation]relu:xrelu(x)eliflayer_info[activation]leaky_relu:xleaky_relu(x)activations.append(x)returnactivationsdefpredict(self,x):预测activationsself.forward(x)returnactivations[-1]# 构建示例CNNmodelCNN()# 架构输入 - Conv - ReLU - Pool - Conv - ReLU - Poolmodel.add_conv_layer(in_channels3,out_channels16,kernel_size3,padding1)model.add_activation(relu)model.add_pool_layer(max,2)model.add_conv_layer(in_channels16,out_channels32,kernel_size3,padding1)model.add_activation(relu)model.add_pool_layer(max,2)# 创建测试输入test_inputnp.random.randn(1,3,32,32)# 前向传播activationsmodel.forward(test_input)print(各层输出形状:)fori,activationinenumerate(activations):print(fLayer{i}:{activation.shape})目标检测基础目标检测不仅要识别图像中的物体类别还要定位物体的位置。边界框表示classBoundingBox:def__init__(self,x_min,y_min,x_max,y_max,labelNone,confidenceNone):self.x_minx_min self.y_miny_min self.x_maxx_max self.y_maxy_max self.labellabel self.confidenceconfidencepropertydefwidth(self):returnself.x_max-self.x_minpropertydefheight(self):returnself.y_max-self.y_minpropertydefarea(self):returnself.width*self.heightpropertydefcenter(self):return((self.x_minself.x_max)/2,(self.y_minself.y_max)/2)defto_coco_format(self):转换为COCO格式 [x, y, width, 
height]return[self.x_min,self.y_min,self.width,self.height]defto_yolo_format(self,image_width,image_height):转换为YOLO格式 [x_center, y_center, width, height] (归一化)x_center(self.x_minself.x_max)/2/image_width y_center(self.y_minself.y_max)/2/image_height widthself.width/image_width heightself.height/image_heightreturn[x_center,y_center,width,height]defcalculate_iou(box1,box2):计算两个边界框的IoU# 计算交集x1max(box1.x_min,box2.x_min)y1max(box1.y_min,box2.y_min)x2min(box1.x_max,box2.x_max)y2min(box1.y_max,box2.y_max)ifx2x1ory2y1:return0intersection(x2-x1)*(y2-y1)# 计算并集unionbox1.areabox2.area-intersectionreturnintersection/union# 示例使用box1BoundingBox(10,10,50,50,cat,0.9)box2BoundingBox(30,30,70,70,cat,0.85)print(f边界框1: 中心{box1.center}, 面积{box1.area})print(f边界框2: 中心{box2.center}, 面积{box2.area})print(fIoU:{calculate_iou(box1,box2):.4f})print(fCOCO格式:{box1.to_coco_format()})print(fYOLO格式(假设图像100x100):{box1.to_yolo_format(100,100)})非极大值抑制NMSdefnon_max_suppression(boxes,iou_threshold0.5):非极大值抑制ifnotboxes:return[]# 按置信度排序boxessorted(boxes,keylambdax:x.confidence,reverseTrue)selected_boxes[]whileboxes:# 选择置信度最高的框current_boxboxes.pop(0)selected_boxes.append(current_box)# 移除与当前框IoU过高的框remaining_boxes[]forboxinboxes:ifcalculate_iou(current_box,box)iou_threshold:remaining_boxes.append(box)boxesremaining_boxesreturnselected_boxes# 示例使用# 创建多个重叠的检测框detections[BoundingBox(10,10,50,50,cat,0.95),BoundingBox(12,12,52,52,cat,0.90),BoundingBox(15,15,55,55,cat,0.85),BoundingBox(100,100,150,150,dog,0.88),BoundingBox(102,102,152,152,dog,0.82),]# 应用NMSselectednon_max_suppression(detections,iou_threshold0.5)print(f原始检测框数量:{len(detections)})print(fNMS后保留的框数量:{len(selected)})fori,boxinenumerate(selected):print(f框{i1}:{box.label}, 置信度{box.confidence:.4f})实战项目简单的目标检测器数据准备importnumpyasnpimportcv2classObjectDetectionDataset:def__init__(self,image_size224,grid_size7):self.image_sizeimage_size self.grid_sizegrid_size self.cell_sizeimage_size//grid_size self.num_classes3# 假设3个类别self.num_boxes2# 
每个格子预测2个框defgenerate_synthetic_data(self,num_samples1000):生成合成数据用于训练images[]labels[]class_names[circle,rectangle,triangle]for_inrange(num_samples):# 创建空白图像imagenp.zeros((self.image_size,self.image_size,3),dtypenp.uint8)# 创建标签网格label_gridnp.zeros((self.grid_size,self.grid_size,5*self.num_boxesself.num_classes))# 随机放置1-3个物体num_objectsnp.random.randint(1,4)for_inrange(num_objects):# 随机选择类别class_idxnp.random.randint(self.num_classes)class_nameclass_names[class_idx]# 随机位置和大小sizenp.random.randint(20,60)xnp.random.randint(size,self.image_size-size)ynp.random.randint(size,self.image_size-size)# 绘制物体ifclass_namecircle:cv2.circle(image,(x,y),size//2,(255,0,0),-1)elifclass_namerectangle:cv2.rectangle(image,(x-size//2,y-size//2),(xsize//2,ysize//2),(0,255,0),-1)else:# trianglepointsnp.array([[x,y-size//2],[x-size//2,ysize//2],[xsize//2,ysize//2]],np.int32)cv2.fillPoly(image,[points],(0,0,255))# 创建边界框boxBoundingBox(x-size//2,y-size//2,xsize//2,ysize//2,class_name,1.0)# 转换为YOLO格式并放入网格self._place_box_in_grid(box,class_idx,label_grid)images.append(image)labels.append(label_grid)returnnp.array(images),np.array(labels)def_place_box_in_grid(self,box,class_idx,label_grid):将边界框放入网格中# 计算中心点所在的格子center_x,center_ybox.center grid_xint(center_x/self.cell_size)grid_yint(center_y/self.cell_size)if0grid_xself.grid_sizeand0grid_yself.grid_size:# 计算相对于格子的坐标x_offset(center_x/self.cell_size)-grid_x y_offset(center_y/self.cell_size)-grid_y# 计算相对于图像的宽高width_offsetbox.width/self.image_size height_offsetbox.height/self.image_size# 将信息放入网格简化版只使用第一个预测框label_grid[grid_y,grid_x,0]1# 置信度label_grid[grid_y,grid_x,1]x_offset label_grid[grid_y,grid_x,2]y_offset label_grid[grid_y,grid_x,3]width_offset label_grid[grid_y,grid_x,4]height_offset# 设置类别label_grid[grid_y,grid_x,5*self.num_boxesclass_idx]1# 生成数据datasetObjectDetectionDataset()images,labelsdataset.generate_synthetic_data(100)print(f图像数据形状:{images.shape})print(f标签数据形状:{labels.shape})# 
可视化几个样本plt.figure(figsize(12,8))foriinrange(min(6,len(images))):plt.subplot(2,3,i1)plt.imshow(images[i])plt.title(fSample{i1})plt.axis(off)plt.show()简化的YOLO风格检测器classYOLODetector:def__init__(self,input_size224,grid_size7,num_classes3):self.input_sizeinput_size self.grid_sizegrid_size self.num_classesnum_classes self.num_boxes2# 构建CNN骨干网络self.backboneCNN()self._build_backbone()# 检测头self._build_detection_head()def_build_backbone(self):构建骨干网络# 输入: 3 x 224 x 224self.backbone.add_conv_layer(3,32,3,padding1)self.backbone.add_activation(relu)self.backbone.add_pool_layer(max,2)# 112 x 112self.backbone.add_conv_layer(32,64,3,padding1)self.backbone.add_activation(relu)self.backbone.add_pool_layer(max,2)# 56 x 56self.backbone.add_conv_layer(64,128,3,padding1)self.backbone.add_activation(relu)self.backbone.add_pool_layer(max,2)# 28 x 28self.backbone.add_conv_layer(128,256,3,padding1)self.backbone.add_activation(relu)self.backbone.add_pool_layer(max,2)# 14 x 14self.backbone.add_conv_layer(256,512,3,padding1)self.backbone.add_activation(relu)self.backbone.add_pool_layer(max,2)# 7 x 7def_build_detection_head(self):构建检测头# 展平后的特征维度: 512 * 7 * 7flattened_size512*self.grid_size*self.grid_size# 全连接层self.fc1np.random.randn(flattened_size,1024)*0.01self.fc1_biasnp.zeros(1024)# 输出层: 每个格子输出 (5 * num_boxes num_classes)output_sizeself.grid_size*self.grid_size*(5*self.num_boxesself.num_classes)self.fc2np.random.randn(1024,output_size)*0.01self.fc2_biasnp.zeros(output_size)defforward(self,x):前向传播# 骨干网络activationsself.backbone.forward(x)featuresactivations[-1]# 展平特征batch_sizefeatures.shape[0]flattenedfeatures.reshape(batch_size,-1)# 全连接层fc1_outrelu(np.dot(flattened,self.fc1)self.fc1_bias)fc2_outnp.dot(fc1_out,self.fc2)self.fc2_bias# 重塑为网格格式outputfc2_out.reshape(batch_size,self.grid_size,self.grid_size,-1)# 最后一维: 5*num_boxes 
num_classesreturnoutputdefdecode_predictions(self,predictions,conf_threshold0.5):解码预测结果batch_sizepredictions.shape[0]all_detections[]forbinrange(batch_size):detections[]foriinrange(self.grid_size):forjinrange(self.grid_size):cell_predpredictions[b,i,j]# 解码边界框简化版只考虑第一个框confidencesigmoid(cell_pred[0])ifconfidenceconf_threshold:# 相对于格子的坐标x_offsetsigmoid(cell_pred[1])y_offsetsigmoid(cell_pred[2])# 宽高w_offsetcell_pred[3]h_offsetcell_pred[4]# 转换为绝对坐标cell_sizeself.input_size/self.grid_size center_x(jx_offset)*cell_size center_y(iy_offset)*cell_size widthnp.exp(w_offset)*cell_size heightnp.exp(h_offset)*cell_size# 计算边界框坐标x_mincenter_x-width/2y_mincenter_y-height/2x_maxcenter_xwidth/2y_maxcenter_yheight/2# 获取类别class_probscell_pred[5*self.num_boxes:]class_idxnp.argmax(class_probs)# 创建检测框boxBoundingBox(x_min,y_min,x_max,y_max,labelstr(class_idx),confidenceconfidence)detections.append(box)# 应用NMSifdetections:detectionsnon_max_suppression(detections)all_detections.append(detections)returnall_detectionsdefsigmoid(x):Sigmoid函数return1/(1np.exp(-x))# 创建检测器detectorYOLODetector()# 测试前向传播test_inputnp.random.randn(2,3,224,224)predictionsdetector.forward(test_input)print(f输入形状:{test_input.shape})print(f预测输出形状:{predictions.shape})# 解码预测detectionsdetector.decode_predictions(predictions)print(f检测到的框数量:{len(detections[0])})训练目标检测器classObjectDetectionTrainer:def__init__(self,model,learning_rate0.001):self.modelmodel self.learning_ratelearning_rate self.datasetObjectDetectionDataset()defcompute_loss(self,predictions,targets):计算损失简化版# MSE损失mse_lossnp.mean((predictions-targets)**2)returnmse_lossdeftrain_step(self,images,targets):单步训练# 前向传播predictionsself.model.forward(images)# 计算损失lossself.compute_loss(predictions,targets)# 简化的参数更新实际需要完整的反向传播# 这里仅作演示returnlossdeftrain(self,epochs10,batch_size8):训练模型# 
生成训练数据train_images,train_labelsself.dataset.generate_synthetic_data(1000)print(开始训练...)forepochinrange(epochs):total_loss0num_batcheslen(train_images)//batch_sizeforiinrange(0,len(train_images),batch_size):batch_imagestrain_images[i:ibatch_size]batch_targetstrain_labels[i:ibatch_size]# 归一化图像batch_imagesbatch_images/255.0# 训练步骤lossself.train_step(batch_images,batch_targets)total_lossloss avg_losstotal_loss/num_batchesprint(fEpoch{epoch1}/{epochs}, Average Loss:{avg_loss:.4f})# 训练检测器trainerObjectDetectionTrainer(detector)trainer.train(epochs5)可视化检测结果defvisualize_detections(image,detections,class_namesNone):可视化检测结果ifclass_namesisNone:class_names[circle,rectangle,triangle]plt.figure(figsize(10,10))plt.imshow(image)forboxindetections:# 绘制边界框rectplt.Rectangle((box.x_min,box.y_min),box.width,box.height,fillFalse,colorred,linewidth2)plt.gca().add_patch(rect)# 添加标签ifbox.labelisnotNone:class_idxint(box.label)ifclass_idxlen(class_names):label_textf{class_names[class_idx]}:{box.confidence:.2f}else:label_textfClass{class_idx}:{box.confidence:.2f}plt.text(box.x_min,box.y_min-5,label_text,colorred,fontsize12,bboxdict(facecolorwhite,alpha0.7))plt.axis(off)plt.show()# 创建测试图像test_imagenp.zeros((224,224,3),dtypenp.uint8)cv2.circle(test_image,(100,100),30,(255,0,0),-1)cv2.rectangle(test_image,(150,50),(200,100),(0,255,0),-1)# 模拟检测结果simulated_detections[BoundingBox(70,70,130,130,0,0.95),# circleBoundingBox(150,50,200,100,1,0.88),# rectangle]# 可视化visualize_detections(test_image,simulated_detections)高级目标检测算法简介R-CNN系列classRCNNConcept:R-CNN系列的概念性实现def__init__(self):self.regions[]# 区域提议defselective_search(self,image):选择性搜索生成区域提议简化版# 实际实现使用更复杂的算法h,wimage.shape[:2]proposals[]# 
生成多尺度的窗口scales[0.5,0.75,1.0,1.25,1.5]aspect_ratios[0.5,1.0,2.0]forscaleinscales:forratioinaspect_ratios:widthint(w*scale)heightint(width*ratio)forxinrange(0,w,width//4):foryinrange(0,h,height//4):x_endmin(xwidth,w)y_endmin(yheight,h)proposals.append(BoundingBox(x,y,x_end,y_end))returnproposalsdefclassify_regions(self,image,regions):对每个区域进行分类classifications[]forregioninregions:# 提取区域roiimage[region.y_min:region.y_max,region.x_min:region.x_max]# 调整大小实际应用中使用更复杂的特征提取roi_resizedcv2.resize(roi,(224,224))# 分类这里使用随机分类作为示例class_probsnp.random.rand(3)class_idxnp.argmax(class_probs)confidenceclass_probs[class_idx]classifications.append({box:region,class:str(class_idx),confidence:confidence})returnclassifications# 演示R-CNN概念rcnnRCNNConcept()regionsrcnn.selective_search(test_image)print(f生成了{len(regions)}个区域提议)# 选择部分区域进行分类sample_regionsregions[:10]classificationsrcnn.classify_regions(test_image,sample_regions)print(\n分类结果示例:)fori,clsinenumerate(classifications[:5]):print(f区域{i1}: 类别{cls[class]}, 置信度{cls[confidence]:.4f})SSDSingle Shot MultiBox DetectorclassSSDConcept:SSD的概念性实现def__init__(self,image_size300):self.image_sizeimage_size self.feature_maps[(38,38),# Conv4_3(19,19),# Conv7(10,10),# Conv8_2(5,5),# Conv9_2(3,3),# Conv10_2(1,1)# Conv11_2]self.default_boxesself._generate_default_boxes()def_generate_default_boxes(self):生成默认框default_boxes[]# 每个特征图的尺度和长宽比scales[0.1,0.2,0.37,0.54,0.71,0.88]aspect_ratios[[1,2,1/2],[1,2,3,1/2,1/3],[1,2,3,1/2,1/3],[1,2,3,1/2,1/3],[1,2,1/2],[1,2,1/2]]fork,(h,w)inenumerate(self.feature_maps):foriinrange(h):forjinrange(w):# 中心点cx(j0.5)/w cy(i0.5)/hforratioinaspect_ratios[k]:# 计算宽高w_rationp.sqrt(ratio)h_ratio1/w_ratio# 计算框的宽高box_wscales[k]*w_ratio box_hscales[k]*h_ratio# 添加默认框default_boxes.append([cx,cy,box_w,box_h])returnnp.array(default_boxes)defdecode_boxes(self,loc_preds,variances[0.1,0.2]):解码预测的边界框# loc_preds: [N, num_default_boxes, 4] (cx, cy, w, h)decoded_boxes[]forpredinloc_preds:# 
转换为实际坐标cxpred[:,0]*self.default_boxes[:,2]*variances[0]self.default_boxes[:,0]cypred[:,1]*self.default_boxes[:,3]*variances[0]self.default_boxes[:,1]wself.default_boxes[:,2]*np.exp(pred[:,2]*variances[1])hself.default_boxes[:,3]*np.exp(pred[:,3]*variances[1])# 转换为 (x_min, y_min, x_max, y_max)x_min(cx-w/2)*self.image_size y_min(cy-h/2)*self.image_size x_max(cxw/2)*self.image_size y_max(cyh/2)*self.image_size decoded_boxes.append(np.stack([x_min,y_min,x_max,y_max],axis1))returndecoded_boxes# 演示SSD概念ssdSSDConcept()print(f生成的默认框数量:{len(ssd.default_boxes)})# 模拟预测loc_predsnp.random.randn(1,len(ssd.default_boxes),4)*0.1decodedssd.decode_boxes(loc_preds)print(f解码后的边界框形状:{decoded[0].shape})实际应用示例视频中的目标检测defdetect_objects_in_video(video_path,output_path,detector):在视频中检测物体capcv2.VideoCapture(video_path)# 获取视频属性fpsint(cap.get(cv2.CAP_PROP_FPS))widthint(cap.get(cv2.CAP_PROP_FRAME_WIDTH))heightint(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))# 创建视频写入器fourcccv2.VideoWriter_fourcc(*mp4v)outcv2.VideoWriter(output_path,fourcc,fps,(width,height))frame_count0whileTrue:ret,framecap.read()ifnotret:break# 调整帧大小frame_resizedcv2.resize(frame,(detector.input_size,detector.input_size))# 转换为RGBframe_rgbcv2.cvtColor(frame_resized,cv2.COLOR_BGR2RGB)# 归一化frame_normalizedframe_rgb/255.0# 添加批次维度input_tensorframe_normalized[np.newaxis,...]# 转换维度顺序 (H, W, C) - (N, C, H, W)input_tensorinput_tensor.transpose(0,3,1,2)# 检测物体predictionsdetector.forward(input_tensor)detectionsdetector.decode_predictions(predictions)[0]# 在原始帧上绘制检测结果forboxindetections:# 缩放坐标到原始帧大小scale_xwidth/detector.input_size scale_yheight/detector.input_size x_minint(box.x_min*scale_x)y_minint(box.y_min*scale_y)x_maxint(box.x_max*scale_x)y_maxint(box.y_max*scale_y)# 绘制边界框cv2.rectangle(frame,(x_min,y_min),(x_max,y_max),(0,255,0),2)# 添加标签labelfClass{box.label}:{box.confidence:.2f}cv2.putText(frame,label,(x_min,y_min-10),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,255,0),2)# 
写入帧out.write(frame)frame_count1ifframe_count%300:print(f已处理{frame_count}帧)cap.release()out.release()print(视频处理完成!)# 使用示例需要实际的视频文件# detect_objects_in_video(input_video.mp4, output_video.mp4, detector)性能评估mAPMean Average Precision计算classDetectionEvaluator:def__init__(self,iou_threshold0.5):self.iou_thresholdiou_thresholddefcalculate_ap(self,predictions,ground_truths,class_name):计算单个类别的平均精度# 按置信度排序predictionssorted(predictions,keylambdax:x.confidence,reverseTrue)tpnp.zeros(len(predictions))fpnp.zeros(len(predictions))num_gtlen(ground_truths)fori,predinenumerate(predictions):max_iou0best_gt_idx-1forj,gtinenumerate(ground_truths):ifgt.labelclass_name:ioucalculate_iou(pred,gt)ifioumax_iou:max_iouiou best_gt_idxjifmax_iouself.iou_threshold:# 检查是否已匹配ifnotgetattr(ground_truths[best_gt_idx],matched,False):tp[i]1ground_truths[best_gt_idx].matchedTrueelse:fp[i]1else:fp[i]1# 计算精度和召回率fp_cumsumnp.cumsum(fp)tp_cumsumnp.cumsum(tp)precisionstp_cumsum/(tp_cumsumfp_cumsum1e-10)recallstp_cumsum/(num_gt1e-10)# 计算APapself._calculate_ap_voc(precisions,recalls)returnapdef_calculate_ap_voc(self,precisions,recalls):使用VOC 2007方法计算AP# 在11个召回率点上插值recall_levelsnp.linspace(0,1,11)ap0forlevelinrecall_levels:# 找到召回率大于等于该水平的最大精度maskrecallslevelifnp.any(mask):pnp.max(precisions[mask])app/11returnapdefcalculate_map(self,predictions_list,ground_truths_list,class_names):计算mAPaps[]forclass_nameinclass_names:class_predictions[]class_ground_truths[]forpredictions,ground_truthsinzip(predictions_list,ground_truths_list):# 筛选当前类别的预测和真实标注class_predictions.extend([pforpinpredictionsifp.labelclass_name])class_ground_truths.extend([gforginground_truthsifg.labelclass_name])# 计算当前类别的APifclass_ground_truths:apself.calculate_ap(class_predictions,class_ground_truths,class_name)aps.append(ap)print(f类别{class_name}: AP {ap:.4f})# 计算mAPmap_scorenp.mean(aps)ifapselse0print(fmAP:{map_score:.4f})returnmap_score# 模拟评估evaluatorDetectionEvaluator()# 
模拟预测和真实标注
predictions = [
    BoundingBox(10, 10, 50, 50, "cat", 0.9),
    BoundingBox(100, 100, 150, 150, "dog", 0.8),
    BoundingBox(200, 200, 250, 250, "cat", 0.7),
]
ground_truths = [
    BoundingBox(12, 12, 52, 52, "cat"),
    BoundingBox(102, 102, 152, 152, "dog"),
    BoundingBox(200, 200, 250, 250, "cat"),
    BoundingBox(300, 300, 350, 350, "bird"),
]

# 计算mAP
class_names = ["cat", "dog", "bird"]
map_score = evaluator.calculate_map([predictions], [ground_truths], class_names)

总结

本文深入探讨了计算机视觉中的卷积神经网络和目标检测技术，涵盖了：

1. 基础概念：图像表示、预处理等基础知识
2. CNN核心组件：卷积、池化、激活函数的原理和实现
3. 目标检测基础：边界框表示、IoU计算、NMS等关键技术
4. 实战项目：从零实现一个简化的YOLO风格检测器
5. 高级算法：R-CNN系列、SSD等先进算法的概念
6. 实际应用：视频目标检测、性能评估等

计算机视觉是一个快速发展的领域，新的算法和技术不断涌现。从传统的手工特征到深度学习的端到端学习，目标检测技术已经取得了巨大进步。掌握CNN的原理和目标检测的核心技术，对于深入理解计算机视觉至关重要。

学习建议

- 深入研究ResNet、DenseNet等经典架构
- 学习实例分割（Mask R-CNN）技术
- 探索目标跟踪算法
- 了解3D目标检测和点云处理
- 实践实际项目，参与Kaggle等竞赛