Installing Visual Studio 2022 Build Tools and runtimes

These are required in order to install torchreid.

Open a Command Prompt with administrator privileges (press the Windows key or open the Start menu, type cmd, right-click the result, and choose "Run as administrator"), then run the commands below. Administrator privileges are required because winget's --scope machine option installs software system-wide.


REM Install Visual Studio 2022 Build Tools and the VC++ runtime
winget install --scope machine Microsoft.VisualStudio.2022.BuildTools Microsoft.VCRedist.2015+.x64
set VS_INSTALLER="C:\Program Files (x86)\Microsoft Visual Studio\Installer\setup.exe"
set VS_PATH="C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools"
REM Install the C++ development workload
%VS_INSTALLER% modify --installPath %VS_PATH% ^
--add Microsoft.VisualStudio.Workload.VCTools ^
--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ^
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
--includeRecommended --quiet --norestart
pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
set DISTUTILS_USE_SDK=1
pip install -U setuptools wheel
pip install transformers opencv-python numpy pillow torchreid
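
Optionally, the setup can be verified from the same prompt by checking that PyTorch detects the GPU and that torchreid imports cleanly:

python -c "import torch, torchreid; print(torch.__version__, torch.cuda.is_available())"
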
# Program: RT-DETRv2 + OSNet + Kalman + Persistent Gallery Person ReID
# Combined techniques: RT-DETRv2 (detection) + OSNet (ReID) + adaptive Kalman filter + cross-session continual learning
# Features: person tracking, re-identification, persistent gallery management
# Usage restrictions of the component models: **Recommended for academic research. For commercial use, check the terms of each model. RT-DETRv2 is Apache 2.0 licensed, OSNet is MIT licensed, OpenCV is Apache 2.0 licensed. Always verify the usage restrictions yourself.**
# Prerequisites:
#   - Run: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
#   - Run: pip install transformers opencv-python numpy pillow torchreid

import cv2
import tkinter as tk
from tkinter import filedialog
import torch
import torch.nn.functional as F
import numpy as np
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
from PIL import Image, ImageDraw, ImageFont
import time
import urllib.request
from datetime import datetime, timedelta
from collections import defaultdict
import torchreid
from torchreid.utils import FeatureExtractor
import pickle
import os
import gc

# Automatic GPU/CPU selection
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {str(device)}')
# Optimizations when running on GPU
if device.type == 'cuda':
    torch.backends.cudnn.benchmark = True

# ===== Configuration and constants =====
MODEL_NAME = 'PekingU/rtdetr_v2_r50vd'
CONF_THRESH = 0.5               # detection confidence threshold
PERSON_CLASS_ID = 0             # COCO class id for "person"
REID_SIMILARITY_THRESH = 0.65   # similarity threshold for ReID matching
MIN_PERSON_HEIGHT = 80          # minimum bounding-box height (px) for feature extraction

# Additional settings
FEATURE_DB_PATH = 'person_gallery.pkl'  # persistent gallery file
FEATURE_DECAY_DAYS = 30                 # entries older than this are dropped on load; also the decay horizon
KALMAN_NOISE_RATIO = 0.1                # Kalman process noise scale
TRACK_HISTORY_SIZE = 10                 # trajectory points kept per person
IOU_THRESH = 0.4                        # minimum IoU for tracker-detection matching
MAX_AGE = 30                            # frames without a match before a tracker is removed

COLORS = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255),
          (0, 255, 255), (128, 255, 0), (255, 128, 0), (128, 0, 255), (0, 128, 255)]

FONT_PATH = 'C:/Windows/Fonts/meiryo.ttc'
FONT_SIZE = 25

# ===== Global variables =====
rt_detr_model = None
processor = None
feature_extractor = None

# Data stores
features_db = {}
session_features = {}
active_trackers = {}
person_tracks = defaultdict(list)

# Statistics and counters
frame_count = 0
next_id = 1
next_track_id = 1
results_log = []
detection_stats = defaultdict(int)

# ===== Helper functions =====
def bbox_to_center_size(bbox):
    """Convert a bounding box to its center point and size."""
    x1, y1, x2, y2 = bbox
    cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0
    w, h = x2 - x1, y2 - y1
    return cx, cy, w, h

def center_size_to_bbox(cx, cy, w, h):
    """Convert a center point and size to a bounding box."""
    x1, y1 = cx - w/2, cy - h/2
    x2, y2 = cx + w/2, cy + h/2
    return int(x1), int(y1), int(x2), int(y2)

def clear_gpu_memory(variables_to_delete=None):
    """Common GPU memory cleanup routine."""
    if variables_to_delete:
        for var in variables_to_delete:
            # Note: del only removes the local reference; the caller's references
            # are released when they go out of scope.
            del var
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def iou(bbox1, bbox2):
    """Compute the IoU of two bounding boxes."""
    x1_inter = max(bbox1[0], bbox2[0])
    y1_inter = max(bbox1[1], bbox2[1])
    x2_inter = min(bbox1[2], bbox2[2])
    y2_inter = min(bbox1[3], bbox2[3])

    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)

    area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    union_area = area1 + area2 - inter_area

    if union_area == 0:
        return 0.0
    return inter_area / union_area

# ===== Database management functions =====
def load_feature_database():
    """Load the persistent feature (gallery) database."""
    global features_db, next_id

    if os.path.exists(FEATURE_DB_PATH):
        with open(FEATURE_DB_PATH, 'rb') as f:
            data = pickle.load(f)
            current_time = datetime.now()

            features_db = {
                pid: info for pid, info in data.items()
                if (current_time - info['timestamp']).days < FEATURE_DECAY_DAYS
            }

            if features_db:
                next_id = max(features_db.keys()) + 1

            print(f"Gallery DB loaded: {len(features_db)} people")
    else:
        print("Creating a new gallery DB")

def save_feature_database():
    """Save the feature database to disk."""
    with open(FEATURE_DB_PATH, 'wb') as f:
        pickle.dump(features_db, f)
    print(f"Gallery DB saved: {len(features_db)} people")

def add_or_update_person_features(person_id, features, quality_score):
    """Add or update a person's gallery features."""
    current_time = datetime.now()

    if person_id in features_db:
        old_data = features_db[person_id]
        # Blend the stored features with the new sample: older entries (higher
        # update_count) get more weight, capped at 0.8, while the new sample is
        # weighted by its quality score.
        old_weight = min(0.8, old_data['update_count'] * 0.1)
        new_weight = quality_score * 0.2

        total_weight = old_weight + new_weight
        # Move the stored CPU features to the same device as the new features before blending
        old_features = old_data['features'].to(features.device)
        updated_features = (old_weight * old_features + new_weight * features) / total_weight

        normalized_features = F.normalize(updated_features, p=2, dim=1)

        features_db[person_id] = {
            'features': normalized_features.detach().cpu(),
            'timestamp': current_time,
            'update_count': old_data['update_count'] + 1
        }
    else:
        features_db[person_id] = {
            'features': features.detach().cpu(),
            'timestamp': current_time,
            'update_count': 1
        }

def get_weighted_features():
    """Return gallery features with time-decay weights (currently unused; hybrid_person_matching recomputes the decay inline)."""
    current_time = datetime.now()
    weighted_features = {}

    for pid, data in features_db.items():
        days_old = (current_time - data['timestamp']).days
        decay_weight = max(0.3, 1.0 - (days_old / FEATURE_DECAY_DAYS))

        features_tensor = data['features'].to(device)

        weighted_features[pid] = {
            'features': features_tensor,
            'weight': decay_weight
        }

    return weighted_features

# ===== Kalman filter functions =====
def create_kalman_tracker(initial_bbox, track_id):
    """Create a Kalman filter tracker for a new detection."""
    kf = cv2.KalmanFilter(8, 4)
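    # State vector: [cx, cy, w, h, vx, vy, vw, vh] (box center, size, and their velocities);
    # the measurement is the observed [cx, cy, w, h] of the detected bounding box.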

    kf.measurementMatrix = np.array([[1,0,0,0,0,0,0,0],
                                   [0,1,0,0,0,0,0,0],
                                   [0,0,1,0,0,0,0,0],
                                   [0,0,0,1,0,0,0,0]], dtype=np.float32)

    kf.transitionMatrix = np.array([[1,0,0,0,1,0,0,0],
                                  [0,1,0,0,0,1,0,0],
                                  [0,0,1,0,0,0,1,0],
                                  [0,0,0,1,0,0,0,1],
                                  [0,0,0,0,1,0,0,0],
                                  [0,0,0,0,0,1,0,0],
                                  [0,0,0,0,0,0,1,0],
                                  [0,0,0,0,0,0,0,1]], dtype=np.float32)

    kf.processNoiseCov = (KALMAN_NOISE_RATIO * np.eye(8)).astype(np.float32)
    kf.measurementNoiseCov = (10.0 * np.eye(4)).astype(np.float32)
    kf.errorCovPost = np.eye(8, dtype=np.float32)

    cx, cy, w, h = bbox_to_center_size(initial_bbox)
    kf.statePre = np.array([cx, cy, w, h, 0, 0, 0, 0], dtype=np.float32).reshape((8, 1))
    kf.statePost = np.array([cx, cy, w, h, 0, 0, 0, 0], dtype=np.float32).reshape((8, 1))

    tracker_info = {
        'kf': kf,
        'time_since_update': 0,
        'reid_confidence': 1.0,
        'last_bbox': initial_bbox,
        'initial_velocity_set': False
    }

    return tracker_info

def predict_kalman_tracker(tracker_info):
    """Kalman prediction step: advance the state and return the predicted bbox."""
    kf = tracker_info['kf']
    predicted_state = kf.predict()

    cx, cy, w, h = predicted_state[0, 0], predicted_state[1, 0], predicted_state[2, 0], predicted_state[3, 0]
    tracker_info['time_since_update'] += 1

    return center_size_to_bbox(cx, cy, w, h)

def update_kalman_tracker(tracker_info, bbox, reid_conf):
    """Kalman update step: correct the state with a new detection."""
    kf = tracker_info['kf']
    cx, cy, w, h = bbox_to_center_size(bbox)

    if not tracker_info['initial_velocity_set']:
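        # First update for this tracker: bootstrap the velocity components from the
        # displacement between the previous and current bounding boxes.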
        last_cx, last_cy, last_w, last_h = bbox_to_center_size(tracker_info['last_bbox'])

        vx, vy, vw, vh = cx - last_cx, cy - last_cy, w - last_w, h - last_h

        kf.statePost = np.array([cx, cy, w, h, vx, vy, vw, vh], dtype=np.float32).reshape((8, 1))
        kf.statePre = kf.statePost.copy()

        tracker_info['initial_velocity_set'] = True

    noise_factor = max(0.1, 1.0 - reid_conf)
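    # Adaptive measurement noise: the lower the ReID confidence, the larger the noise,
    # so uncertain detections pull the filter state less.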
    kf.measurementNoiseCov = (noise_factor * 10.0 * np.eye(4)).astype(np.float32)

    measurement = np.array([cx, cy, w, h], dtype=np.float32).reshape((4, 1))
    kf.correct(measurement)

    tracker_info['time_since_update'] = 0
    tracker_info['reid_confidence'] = reid_conf
    tracker_info['last_bbox'] = bbox

# ===== Feature extraction and matching functions =====
def extract_person_features(frame, bbox):
    """Extract appearance features for a detected person."""
    x1, y1, x2, y2 = map(int, bbox)

    if (y2 - y1) < MIN_PERSON_HEIGHT:
        return None

    h, w = frame.shape[:2]
    x1, y1, x2, y2 = max(0, x1), max(0, y1), min(w, x2), min(h, y2)

    if x2 <= x1 or y2 <= y1:
        return None

    person_crop = frame[y1:y2, x1:x2]
    image_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
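    # torchreid's FeatureExtractor accepts a list of RGB numpy arrays and applies its own
    # resizing and normalization before running the OSNet model.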

    features = feature_extractor([image_rgb])

    normalized_features = F.normalize(features, p=2, dim=1)

    return normalized_features

def hybrid_person_matching(query_features):
    """Hybrid matching against current-session features and the persistent DB."""
    max_similarity = 0.0
    best_match_id = None
    match_source = 'none'

    for pid, feat in session_features.items():
        if feat.device != query_features.device:
            feat = feat.to(query_features.device)
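        # The 1.2 factor favors identities already seen in the current session over gallery entries.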
        similarity = 1.2 * torch.mm(query_features, feat.t()).item()
        if similarity > max_similarity:
            max_similarity = similarity
            best_match_id = pid
            match_source = 'session'

    # Time-decay-weighted matching against persistent DB features
    current_time = datetime.now()
    for pid, data in features_db.items():
        days_old = (current_time - data['timestamp']).days
        decay_weight = max(0.3, 1.0 - (days_old / FEATURE_DECAY_DAYS))

        features_tensor = data['features'].to(query_features.device)
        similarity = decay_weight * torch.mm(query_features, features_tensor.t()).item()

        if similarity > max_similarity:
            max_similarity = similarity
            best_match_id = pid
            match_source = 'database'

    return best_match_id, max_similarity, match_source

def process_person_detection(frame, bbox):
    """Process a single person detection: extract features, match, and assign an ID."""
    global next_id

    query_features = extract_person_features(frame, bbox)
    if query_features is None:
        return None, 0.0, 'feature_error'

    person_id, similarity, match_source = hybrid_person_matching(query_features)

    if similarity >= REID_SIMILARITY_THRESH:
        session_features[person_id] = query_features.detach()
        quality_score = min(1.0, similarity * 1.2)
        add_or_update_person_features(person_id, query_features, quality_score)
    else:
        person_id = next_id
        next_id += 1
        session_features[person_id] = query_features.detach()
        add_or_update_person_features(person_id, query_features, 1.0)
        match_source = 'new'
        similarity = 1.0

    detection_stats[match_source] += 1
    return person_id, similarity, match_source

# ===== Model initialization =====
def initialize_models():
    """Initialize the detection and ReID models and load the gallery DB."""
    global rt_detr_model, processor, feature_extractor

    print('Initializing RT-DETRv2 model...')
    rt_detr_model = RTDetrV2ForObjectDetection.from_pretrained(MODEL_NAME)
    processor = RTDetrImageProcessor.from_pretrained(MODEL_NAME)
    rt_detr_model.to(device)
    rt_detr_model.eval()
    print('RT-DETRv2 initialized')

    print('Initializing OSNet ReID model...')
    feature_extractor = FeatureExtractor(
        model_name='osnet_x1_0',
        model_path=None,
        device=device
    )
    print('OSNet initialized')

    load_feature_database()

def video_frame_processing(frame):
    global frame_count, active_trackers, next_track_id
    current_time = time.time()
    frame_count += 1

    # RT-DETRv2 detection
    frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = processor(images=frame_pil, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = rt_detr_model(**inputs)

    target_sizes = torch.tensor([frame.shape[:2]]).to(device)
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=CONF_THRESH
    )[0]

    # --- Tracking logic start ---

    # 1. Predict positions of existing trackers
    predicted_boxes = {}
    for track_id, tracker_info in active_trackers.items():
        predicted_boxes[track_id] = predict_kalman_tracker(tracker_info)

    # 2. Match detections against predicted boxes
    detections = []
    if len(results['labels']) > 0:
        boxes = results['boxes'].cpu().numpy()
        scores = results['scores'].cpu().numpy()
        labels = results['labels'].cpu().numpy()
        person_indices = labels == PERSON_CLASS_ID
        if np.any(person_indices):
            detections = list(zip(boxes[person_indices], scores[person_indices]))

    matched_indices = set()
    matches = []

    if len(predicted_boxes) > 0 and len(detections) > 0:
        iou_matrix = np.zeros((len(predicted_boxes), len(detections)))
        track_ids = list(predicted_boxes.keys())
        for t, track_id in enumerate(track_ids):
            for d, (det_box, _) in enumerate(detections):
                iou_matrix[t, d] = iou(predicted_boxes[track_id], det_box)

        # Greedy matching: repeatedly take the remaining pair with the highest IoU
        for _ in range(min(len(predicted_boxes), len(detections))):
            t, d = np.unravel_index(np.argmax(iou_matrix, axis=None), iou_matrix.shape)
            if iou_matrix[t, d] < IOU_THRESH:
                break
            track_id = track_ids[t]
            matches.append((track_id, detections[d]))
            matched_indices.add(d)
            iou_matrix[t, :] = -1  # this tracker is matched
            iou_matrix[:, d] = -1  # this detection is matched

    # 3. Update, remove, and create trackers
    current_persons = []

    # 3a. Update matched trackers
    for track_id, (bbox, det_score) in matches:
        person_id, reid_conf, match_source = process_person_detection(frame, bbox)
        if person_id is not None:
            update_kalman_tracker(active_trackers[track_id], bbox, reid_conf)
            active_trackers[track_id]['person_id'] = person_id  # keep the ReID identity

            current_persons.append({
                'id': person_id, 'bbox': active_trackers[track_id]['last_bbox'],
                'detection_conf': float(det_score), 'reid_conf': float(reid_conf),
                'match_source': match_source, 'center': bbox_to_center_size(bbox)[:2]
            })

    # 3b. Handle unmatched trackers (lost or removed)
    unmatched_track_ids = set(predicted_boxes.keys()) - set(m[0] for m in matches)
    for track_id in list(unmatched_track_ids):
        if active_trackers[track_id]['time_since_update'] > MAX_AGE:
            del active_trackers[track_id]

    # 3c. Create new trackers from unmatched detections
    unmatched_detection_indices = set(range(len(detections))) - matched_indices
    for d in unmatched_detection_indices:
        bbox, det_score = detections[d]
        person_id, reid_conf, match_source = process_person_detection(frame, bbox)
        if person_id is not None:
            active_trackers[next_track_id] = create_kalman_tracker(bbox, next_track_id)
            active_trackers[next_track_id]['person_id'] = person_id

            current_persons.append({
                'id': person_id, 'bbox': bbox,
                'detection_conf': float(det_score), 'reid_conf': float(reid_conf),
                'match_source': match_source, 'center': bbox_to_center_size(bbox)[:2]
            })
            next_track_id += 1

    # --- Tracking logic end ---

    # Draw trajectories and bounding boxes
    for person in current_persons:
        person_id = person['id']
        x1, y1, x2, y2 = map(int, person['bbox'])
        color = COLORS[person_id % len(COLORS)]

        person_tracks[person_id].append(person['center'])
        if len(person_tracks[person_id]) > TRACK_HISTORY_SIZE:
            person_tracks[person_id].pop(0)

        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 3)

        if len(person_tracks[person_id]) > 1:
            points = np.array(person_tracks[person_id], np.int32).reshape((-1, 1, 2))
            cv2.polylines(frame, [points], isClosed=False, color=color, thickness=2)

    # Text overlay (Pillow)
    img_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(img_pil)
    try:
        font = ImageFont.truetype(FONT_PATH, FONT_SIZE)
    except IOError:
        font = ImageFont.load_default()

    active_ids = len(set(p['id'] for p in current_persons))
    total_db = len(features_db)

    draw.text((10, 30), f'RT-DETRv2+OSNet+Kalman ({device})', font=font, fill=(255, 255, 255))
    draw.text((10, 60), f'Frame:{frame_count} | Active:{active_ids} | DB:{total_db}', font=font, fill=(0, 255, 255))
    draw.text((10, 90), 'q=quit | Persistent Gallery Enabled', font=font, fill=(255, 255, 0))

    for person in current_persons:
        x1, y1, _, _ = map(int, person['bbox'])
        pid = person['id']
        source = person['match_source']
        label = f'ID:{pid}({source})'
        conf = f'D:{person["detection_conf"]:.0%} R:{person["reid_conf"]:.0%}'
        draw.text((x1, max(0, y1 - 60)), label, font=font, fill=(255, 255, 255))
        draw.text((x1, max(0, y1 - 30)), conf, font=font, fill=(255, 255, 255))

    frame = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    result = f'Frame {frame_count}: {len(current_persons)} people detected'
    for person in current_persons:
        result += f' | ID{person["id"]}({person["match_source"]}:' + \
                  f'Det{person["detection_conf"]:.0%},ReID{person["reid_conf"]:.0%})'

    if frame_count % 50 == 0:
        clear_gpu_memory([inputs, outputs, results])

    return frame, result, current_time

# Program overview
print('=== RT-DETRv2 + OSNet + Kalman + Persistent Gallery Person ReID ===')
print('Integrated features: person detection, re-identification, adaptive tracking, persistent learning')
print('Techniques: RT-DETRv2 detection + OSNet feature extraction + adaptive Kalman + session continuity')
print('Controls: 0=video file, 1=camera, 2=sample video / press q to quit')
print()

# System initialization
initialize_models()

print("0: Video file")
print("1: Camera")
print("2: Sample video")

choice = input("Select: ")

if choice == '0':
    root = tk.Tk()
    root.withdraw()
    path = filedialog.askopenfilename()
    if not path:
        exit()
    cap = cv2.VideoCapture(path)
elif choice == '1':
    cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
    if not cap.isOpened():
        cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
else:
    SAMPLE_URL = 'https://raw.githubusercontent.com/opencv/opencv/master/samples/data/vtest.avi'
    SAMPLE_FILE = 'vtest.avi'
    urllib.request.urlretrieve(SAMPLE_URL, SAMPLE_FILE)
    cap = cv2.VideoCapture(SAMPLE_FILE)

if not cap.isOpened():
    print('Could not open the video file or camera')
    exit()

# Main processing
print('\n=== Starting video processing ===')
print('Controls:')
print('  q key: quit the program')
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        MAIN_FUNC_DESC = "RT-DETRv2 + OSNet Person ReID"
        processed_frame, result, current_time = video_frame_processing(frame)
        cv2.imshow(MAIN_FUNC_DESC, processed_frame)
        if choice == '1':  # camera input
            print(datetime.fromtimestamp(current_time).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3], result)
        else:  # video file input
            print(frame_count, result)
        results_log.append(result)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    print('\n=== Program finished ===')
    cap.release()
    cv2.destroyAllWindows()

    save_feature_database()

    # Full memory cleanup on exit
    if torch.cuda.is_available():
        clear_gpu_memory([rt_detr_model, processor, feature_extractor])


    print(f'Frames processed: {frame_count}')
    print(f'Final DB size: {len(features_db)} people')
    for source, count in detection_stats.items():
        print(f'{source} matches: {count}')

    if results_log:
        with open('result.txt', 'w', encoding='utf-8') as f:
            f.write('=== Results ===\n')
            f.write(f'Frames processed: {frame_count}\n')
            f.write(f'Device used: {str(device).upper()}\n')
            if device.type == 'cuda':
                f.write(f'GPU: {torch.cuda.get_device_name(0)}\n')
            f.write('\n')
            f.write('\n'.join(results_log))
        print('\nResults saved to result.txt')