어떻게 배칭 추론이 되는지 bs 변수 동작 확인

[image detection]

# image detection

## od_inference 파일의 43라인
batch_size=1,  # batch size

## od_inference 파일의 85라인
model.warmup(imgsz=(1 if pt else batch_size, 3, imgsz, imgsz))  # warmup

## common.py 파일에서 batch size 1에 대해 추론을 한번 실행하여 모델 warmup
def warmup(self, imgsz=(1, 3, 640, 640)):
        """Run a dummy forward pass so later inference calls do not pay first-run cost.

        Args:
            imgsz: Shape of the all-zeros input tensor that is fed to ``self.forward``.

        The warmup is skipped entirely unless at least one of the
        pt/jit/onnx/engine/saved_model/pb flags is set AND ``self.device.type``
        is not ``'cpu'``.
        """
        # Warmup model by running inference once
        warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb
        if any(warmup_types) and self.device.type != 'cpu':
            # Zero-filled input; half precision when self.fp16 is set, float otherwise
            im = torch.zeros(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
            for _ in range(2 if self.jit else 1):  # two passes when self.jit is set, otherwise one
                self.forward(im)  # warmup

## od_inference 파일의 114라인
for batch_i, (im, targets, paths, shapes) in enumerate(pbar):

## od_inference 파일의 127라인
out, train_out = model(im, augment=augment, val=True)  # inference, loss outputs

[video detection]

# video detection

## video_detection.py 파일의 47라인에서 모델 로드 (DetectMultiBackend 함수 사용)
model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)

## common.py 파일의 308라인에서 DetectMultiBackend 클래스의 __init__이 실행되며,
## 이때 TensorFlow SavedModel, edgetpu 모델을 불러오기 위해 사용되는 device는 cpu를 사용
def __init__(self, weights='yolov5s.pt', device=torch.device('cpu'), dnn=False, data=None, fp16=False):
YOLOv5 🚀 2022-5-19 torch 1.10.2 CPU (추론 실행 시 출력되는 내용)

## common.py 파일의 279, 369, 387라인에서 DetectMultiBackend 함수를 실행시키며,
## TensorFlow SavedModel, edgetpu 모델을 불러와 로드시킴
## 그리고 edgetpu의 경우 바로 tpu device 라이브러리를 할당하여 추론에 사용하도록 환경 설정
## TensorFlow SavedModel의 경우 추론 시 사용하는 device 설정은 아래에서 진행
else:  # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
    if saved_model:  # SavedModel
      LOGGER.info(f'Loading {w} for TensorFlow SavedModel inference...')
      import tensorflow as tf
      keras = False  # assume TF1 saved_model
      model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)

    if edgetpu:  # Edge TPU <https://coral.ai/software/#edgetpu-runtime>
      LOGGER.info(f'Loading {w} for TensorFlow Lite Edge TPU inference...')
      delegate = {
      'Linux': 'libedgetpu.so.1',
      'Darwin': 'libedgetpu.1.dylib',
      'Windows': 'edgetpu.dll'}[platform.system()]
      interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)])

## video_detection.py 파일의 55라인에서 bs 변수에 batch size 지정
bs = 1  # batch_size

## video_detection.py 파일의 60라인에서 model warmup 함수 실행을 위해
## batch size, channel, image size 전달
model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup

## common.py 파일의 472라인에서 batch size 1에 대해 추론을 한번 실행하여 모델 warmup
## 장비에서 처음 추론 시간이 길기 때문에 coldstart를 줄이기 위해 사용되는 함수로 확인
## warmup 함수가 각 모델의 device에서 forward 함수를 실행시켜 추론 진행
## (단, 아래 코드의 조건처럼 device가 cpu가 아니고 pt/jit/onnx/engine/saved_model/pb 모델 중 하나일 때만 실행됨)
def warmup(self, imgsz=(1, 3, 640, 640)):
        """Run a dummy forward pass so later inference calls do not pay first-run cost.

        Args:
            imgsz: Shape of the all-zeros input tensor that is fed to ``self.forward``.

        The warmup is skipped entirely unless at least one of the
        pt/jit/onnx/engine/saved_model/pb flags is set AND ``self.device.type``
        is not ``'cpu'``.
        """
        # Warmup model by running inference once
        warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb
        if any(warmup_types) and self.device.type != 'cpu':
            # Zero-filled input; half precision when self.fp16 is set, float otherwise
            im = torch.zeros(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
            for _ in range(2 if self.jit else 1):  # two passes when self.jit is set, otherwise one
                self.forward(im)  # warmup

## device는 TensorFlow SavedModel의 경우 추론을 위해 torch_utils.py 파일의 52, 59라인에서
## select_device 함수를 실행시켜 cuda device 사용
device = select_device(device)

def select_device(device='', batch_size=0, newline=True):
    """Choose the torch device to run on and log a one-line environment summary.

    Args:
        device: Requested device: 'cpu', '0' or '0,1,2,3' (a 'cuda:' prefix is
            stripped). An empty string means "use CUDA when it is available".
        batch_size: When positive and more than one CUDA device is selected,
            it must be a multiple of the device count.
        newline: When False, trailing whitespace (including the final newline)
            is stripped from the summary before it is logged.

    Returns:
        ``torch.device('cuda:0')`` when CUDA is used, else ``torch.device('cpu')``.

    Raises:
        AssertionError: If a non-cpu device is requested but CUDA is unavailable
            or too few devices are visible, or if ``batch_size`` is not a
            multiple of the selected device count.

    Side effects:
        Sets ``os.environ['CUDA_VISIBLE_DEVICES']`` whenever ``device`` is non-empty.
    """
    # device = 'cpu' or '0' or '0,1,2,3'
    s = f'YOLOv5 🚀 {git_describe() or file_update_date()} torch {torch.__version__} '  # string
    device = str(device).strip().lower().replace('cuda:', '')  # to string, 'cuda:0' to '0'
    cpu = device == 'cpu'
    if cpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force torch.cuda.is_available() = False
    elif device:  # non-cpu device requested
        os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable - must be before assert is_available()
        # NOTE(review): the required count is the number of characters left after
        # removing commas, so a multi-digit id such as '10' is counted as two devices.
        assert torch.cuda.is_available() and torch.cuda.device_count() >= len(device.replace(',', '')), \
            f"Invalid CUDA '--device {device}' requested, use '--device cpu' or pass valid CUDA device(s)"

    cuda = not cpu and torch.cuda.is_available()
    if cuda:
        devices = device.split(',') if device else '0'  # range(torch.cuda.device_count())  # i.e. 0,1,6,7
        n = len(devices)  # device count
        if n > 1 and batch_size > 0:  # check batch_size is divisible by device_count
            assert batch_size % n == 0, f'batch-size {batch_size} not multiple of GPU count {n}'
        space = ' ' * (len(s) + 1)  # pads continuation lines so they align under the first device entry
        for i, d in enumerate(devices):
            p = torch.cuda.get_device_properties(i)
            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
    else:
        s += 'CPU\n'

    if not newline:
        s = s.rstrip()
    LOGGER.info(s.encode().decode('ascii', 'ignore') if platform.system() == 'Windows' else s)  # emoji-safe
    return torch.device('cuda:0' if cuda else 'cpu')

## common.py 파일의 410, 448, 454라인에서 forward 함수를 실행하여
## 입력 데이터의 형식을 모델에 맞게 맞추고 추론 실행
# Excerpt from the TensorFlow branch of forward(): run the loaded model on `im`
# and leave the raw prediction array in `y`.
if self.saved_model:  # SavedModel
    y = (self.model(im, training=False) if self.keras else self.model(im)).numpy()
else:  # Lite or Edge TPU
    input, output = self.input_details[0], self.output_details[0]
    int8 = input['dtype'] == np.uint8  # is TFLite quantized uint8 model
    if int8:
        scale, zero_point = input['quantization']
        im = (im / scale + zero_point).astype(np.uint8)  # de-scale
    # The interpreter runs for every Lite / Edge TPU model, quantized or not
    self.interpreter.set_tensor(input['index'], im)
    self.interpreter.invoke()  # inference happens here (on the TPU for Edge TPU models)
    y = self.interpreter.get_tensor(output['index'])
    if int8:
        scale, zero_point = output['quantization']
        y = (y.astype(np.float32) - zero_point) * scale  # re-scale
y[..., :4] *= [w, h, w, h]  # xywh normalized to pixels

## video_detection.py 파일의 45, 64라인에서 batch size 1로 한 번 추론한 모델을 사용하여
## dataset만큼 추론을 진행하며, dataset image shape, box shape, video를
## 입력으로 받아 추론 시작
device = select_device(device)
for path, im, im0s, vid_cap, s in dataset:
	im = torch.from_numpy(im).to(device)

## TensorFlow SavedModel의 경우 추론을 위해 torch_utils.py 파일의 52, 59라인에서
## select_device 함수를 실행시켜 cuda device 사용
def select_device(device='', batch_size=0, newline=True):
    """Choose the torch device to run on and log a one-line environment summary.

    Args:
        device: Requested device: 'cpu', '0' or '0,1,2,3' (a 'cuda:' prefix is
            stripped). An empty string means "use CUDA when it is available".
        batch_size: When positive and more than one CUDA device is selected,
            it must be a multiple of the device count.
        newline: When False, trailing whitespace (including the final newline)
            is stripped from the summary before it is logged.

    Returns:
        ``torch.device('cuda:0')`` when CUDA is used, else ``torch.device('cpu')``.

    Raises:
        AssertionError: If a non-cpu device is requested but CUDA is unavailable
            or too few devices are visible, or if ``batch_size`` is not a
            multiple of the selected device count.

    Side effects:
        Sets ``os.environ['CUDA_VISIBLE_DEVICES']`` whenever ``device`` is non-empty.
    """
    # device = 'cpu' or '0' or '0,1,2,3'
    s = f'YOLOv5 🚀 {git_describe() or file_update_date()} torch {torch.__version__} '  # string
    device = str(device).strip().lower().replace('cuda:', '')  # to string, 'cuda:0' to '0'
    cpu = device == 'cpu'
    if cpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force torch.cuda.is_available() = False
    elif device:  # non-cpu device requested
        os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable - must be before assert is_available()
        # NOTE(review): the required count is the number of characters left after
        # removing commas, so a multi-digit id such as '10' is counted as two devices.
        assert torch.cuda.is_available() and torch.cuda.device_count() >= len(device.replace(',', '')), \
            f"Invalid CUDA '--device {device}' requested, use '--device cpu' or pass valid CUDA device(s)"

    cuda = not cpu and torch.cuda.is_available()
    if cuda:
        devices = device.split(',') if device else '0'  # range(torch.cuda.device_count())  # i.e. 0,1,6,7
        n = len(devices)  # device count
        if n > 1 and batch_size > 0:  # check batch_size is divisible by device_count
            assert batch_size % n == 0, f'batch-size {batch_size} not multiple of GPU count {n}'
        space = ' ' * (len(s) + 1)  # pads continuation lines so they align under the first device entry
        for i, d in enumerate(devices):
            p = torch.cuda.get_device_properties(i)
            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
    else:
        s += 'CPU\n'

    if not newline:
        s = s.rstrip()
    LOGGER.info(s.encode().decode('ascii', 'ignore') if platform.system() == 'Windows' else s)  # emoji-safe
    return torch.device('cuda:0' if cuda else 'cpu')

## video_detection.py 파일의 73라인부터 video frame 하나당 추론 진행
## 위에서 batch size 1로 한 번 추론된 모델에 데이터셋을 입력으로 넣고 추론 후
## bounding box 처리 및 결과 계산
pred = model(im, augment=augment, visualize=False)

## 추론에 들어가는 데이터셋은 video_detection.py 파일의 40-42라인에서
## source에 비디오 데이터셋을 입력으로 받음
source = str(source)
is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
source = check_file(source)  # download

## 그리고 video_detection.py 파일의 54라인에서 LoadImages 함수를 실행하여 frame을 읽어옴
dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)

## datasets.py 파일의 178, 186, 219라인에서 LoadImages 클래스를 실행하여
## 입력 받은 비디오 파일을 frame으로 읽어옴
class LoadImages:
    # YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`
    def __init__(self, path, img_size=640, stride=32, auto=True):
			elif os.path.isfile(p):
            files = [p]  # files
			
			...

			if self.video_flag[self.count]:
            # Read video
            self.mode = 'video'
            ret_val, img0 = self.cap.read()
            while not ret_val:
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                else:
                    path = self.files[self.count]
                    self.new_video(path)
                    ret_val, img0 = self.cap.read()

            self.frame += 1
            s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '

## 그렇게 받아온 dataset을 기준으로 video_detection.py 파일의 45, 60, 64, 73라인에서 추론 시작
## pt는 torch 기반 모델이므로, TensorFlow SavedModel과 edgetpu 모델의 경우 
## batch size에 따라 warmup 모델을 사용하여 추론
model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup
for path, im, im0s, vid_cap, s in dataset:
        im = torch.from_numpy(im).to(device)
        im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
        im /= 255  # 0 - 255 to 0.0 - 1.0
        if len(im.shape) == 3:
            im = im[None]  # expand for batch dim

        # Inference
        iftime_avg_start = time.time()
        pred = model(im, augment=augment, visualize=False)

warmup 함수는 로드된 모델로 지정한 batch size의 추론을 미리 한 번 실행해 둠으로써, 첫 번째 추론 시 발생하는 cold start 문제를 줄이기 위해 사용하는 것으로 생각됩니다.
image detection, video detection 모두 각 모델을 실행하기 위한 device 설정은 common.py, torch_utils.py 파일을 공통으로 사용하여 설정하고, 데이터셋 입력 형식도 모델에 맞게 지정합니다.
image detection, video detection 추론 모두 warmup 함수를 사용하여 batch size에 따라 로드된 모델을 한 번 추론 시켜 놓은 모델을 사용하여 데이터셋 만큼 추론을 진행합니다.
따라서 image, video detection의 경우 batch size에 따른 추론을 진행한 것이 맞다고 생각이 듭니다.
- 배칭 추론이 우리의 데이터셋에 대해 잘 이루어지고 있는지 잘 모르겠고, 코드 사용에 대해 정확하게 더 분석할 필요가 있음
다만, image detection의 경우 batch size에 따라 값이 모두 일정한 것이 이상하여, image detection 코드에서 batch size 옵션 부분을 추가로 다시 살펴보고 있습니다.
```
for batch_i, (im, targets, paths, shapes) in enumerate(pbar):
```
- video detection 코드와 다른 부분은 이 부분인데, 여기서의 batch_i는 추론 후 mAP 계산을 batch 단위로 수행하기 위해 사용된 인덱스입니다.
- 실제로 batch size에 따른 추론은 video detection과 같은 방식으로 추론합니다.