VGGT による多視点画像からの3次元再構成デモ

Python開発環境，ライブラリ類

ここでは、最低限の事前準備について説明する。機械学習や深層学習を行う場合は、NVIDIA CUDA、Visual Studio、Cursorなどを追加でインストールすると便利である。これらについては別ページ https://www.kkaneko.jp/cc/dev/aiassist.html で詳しく解説しているので、必要に応じて参照すること。

Python 3.12 のインストール

インストール済みの場合は実行不要。

管理者権限でコマンドプロンプトを起動（手順：Windowsキーまたはスタートメニュー > cmd と入力 > 右クリック > 「管理者として実行」）し、以下を実行する。管理者権限は、wingetの--scope machineオプションでシステム全体にソフトウェアをインストールするために必要である。

REM Install Python system-wide (machine scope)
winget install --scope machine --id Python.Python.3.12 -e --silent
REM Register the Python folders in the machine PATH.
REM setx only affects future consoles, so the PATH of this console is updated
REM as well; otherwise the second setx would be built from the stale value and
REM overwrite the entry added by the first one.
set "PYTHON_PATH=C:\Program Files\Python312"
set "PYTHON_SCRIPTS_PATH=C:\Program Files\Python312\Scripts"
echo "%PATH%" | find /i "%PYTHON_PATH%" >nul
if errorlevel 1 (setx PATH "%PATH%;%PYTHON_PATH%" /M >nul & set "PATH=%PATH%;%PYTHON_PATH%")
echo "%PATH%" | find /i "%PYTHON_SCRIPTS_PATH%" >nul
if errorlevel 1 (setx PATH "%PATH%;%PYTHON_SCRIPTS_PATH%" /M >nul & set "PATH=%PATH%;%PYTHON_SCRIPTS_PATH%")

【関連する外部ページ】

Python の公式ページ: https://www.python.org/

AI エディタ Windsurf のインストール

Pythonプログラムの編集・実行には、AI エディタの利用を推奨する。ここでは、Windsurf のインストール手順を説明する。

管理者権限でコマンドプロンプトを起動（手順：Windowsキーまたはスタートメニュー > cmd と入力 > 右クリック > 「管理者として実行」）し、以下を実行して、Windsurfをシステム全体にインストールする。管理者権限は、wingetの--scope machineオプションでシステム全体にソフトウェアをインストールするために必要となる。

winget install --scope machine Codeium.Windsurf -e --silent

【関連する外部ページ】

Windsurf の公式ページ: https://windsurf.com/

必要なライブラリのインストール

コマンドプロンプトを管理者として実行（手順：Windowsキーまたはスタートメニュー > cmd と入力 > 右クリック > 「管理者として実行」）し、以下を実行する。


pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip install numpy Pillow huggingface_hub
pip install git+https://github.com/facebookresearch/vggt.git
pip install open3d
pip install gsplat

VGGT による多視点画像からの3次元再構成デモプログラム

ソースコード


# VGGT 3D Scene Reconstruction Demo
#
# 【前準備・インストール】
# 以下のコマンドを実行してから本プログラムを実行してください：
#
# 1. PyTorchのインストール（CUDA対応推奨）
# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
#
# 2. 基本ライブラリのインストール
# pip install numpy Pillow huggingface_hub
#
# 3. VGGTライブラリのインストール
# pip install git+https://github.com/facebookresearch/vggt.git
#
# 4. 3D処理ライブラリのインストール
# pip install open3d
#
# 5. Gaussian Splatting用ライブラリ（オプション）
# pip install gsplat
#
# 【実行環境】
# - Python 3.8以上
# - CUDA対応GPU（推奨、CPUでも動作可能）
# - Windows/Linux/macOS対応
#
# 【注意事項】
# - 初回実行時はVGGTモデル（約1GB）のダウンロードが発生します
# - CUDA Compute Capability 8.0以上のGPUでbfloat16精度が使用可能です

import torch
import numpy as np
from pathlib import Path
import urllib.request
import subprocess
import struct
import sys
import open3d as o3d

from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map

def download_test_images():
    """Fetch the COLMAP South Building dataset and pick a spread of views.

    The archive is stored as ``South-Building.zip`` in the current working
    directory and unpacked into ``south_building_extracted``.  A valid archive
    left over from an earlier run is reused instead of being downloaded again.

    Returns:
        list[str]: Paths of up to 12 images, taking every 10th file of the
        name-sorted image list.

    Raises:
        FileNotFoundError: If none of the candidate directories inside the
            extracted archive contains an image file.
    """
    print("COLMAP South Building多視点データセットをダウンロード中...")

    import zipfile

    # ETH mirror of the dataset
    url = "https://cvg-data.inf.ethz.ch/local-feature-evaluation-schoenberger2017/South-Building.zip"
    zip_path = "South-Building.zip"
    extract_dir = "south_building_extracted"

    # is_zipfile() is False for a missing or truncated file, so an interrupted
    # download from a previous run is fetched again rather than reused.
    if zipfile.is_zipfile(zip_path):
        print("既存のSouth-Building.zipを使用します")
    else:
        print("South-Building.zipをダウンロード中...")
        urllib.request.urlretrieve(url, zip_path)
        print("ダウンロード完了")

    print("アーカイブを展開中...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # Candidate image folders, most specific first
    extract_path = Path(extract_dir)
    possible_dirs = [
        extract_path / "South-Building" / "images",
        extract_path / "images",
        extract_path / "South-Building",
        extract_path
    ]

    # Compare lower-cased suffixes instead of globbing each spelling: on
    # case-insensitive filesystems "*.jpg" and "*.JPG" match the same files,
    # which would list every image twice.
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    image_files = []
    for img_dir in possible_dirs:
        if not img_dir.is_dir():
            continue
        image_files = [
            entry for entry in img_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        ]
        if image_files:
            break

    if not image_files:
        raise FileNotFoundError("画像ファイルが見つかりませんでした")

    # Sort by file name so the selection is reproducible
    image_files = sorted(image_files)
    total_images = len(image_files)
    print(f"総画像数: {total_images}枚")

    # Take non-consecutive frames: the 1st, 11th, 21st, ... (at most 12)
    num_views = 12
    step = 10
    selected_files = [str(entry) for entry in image_files[:num_views * step:step]]

    print(f"選択された画像: {len(selected_files)}枚")
    for i, filepath in enumerate(selected_files):
        print(f"  {i+1}: {Path(filepath).name}")

    return selected_files

def save_colmap_format(extrinsic, intrinsic, point_map_3d_np, image_names, output_dir):
    """Write cameras, poses and points as a COLMAP binary sparse model.

    Creates ``cameras.bin``, ``images.bin`` and ``points3D.bin`` inside
    ``output_dir / "sparse"``.  One PINHOLE camera and one image record are
    written per entry of ``image_names``; image records carry no 2D
    observations and point records carry a fixed grey colour and an empty track.

    Args:
        extrinsic: Tensor indexed as ``[0, view, :3, :4]``; the left 3x3 block
            is stored as the rotation and the last column as the translation.
            Assumed to be world-to-camera, which is what COLMAP stores
            -- TODO confirm against the pose decoder.
        intrinsic: Tensor indexed as ``[0, view]`` giving a 3x3 matrix from
            which fx, fy, cx, cy are read.
        point_map_3d_np: Array whose axes 1 and 2 are image height and width
            and whose last axis holds x, y, z.
        image_names: Image paths; only the file name is stored.
        output_dir: ``pathlib.Path`` of the output directory.
    """
    sparse_dir = output_dir / "sparse"
    sparse_dir.mkdir(parents=True, exist_ok=True)

    num_cameras = len(image_names)
    height, width = point_map_3d_np.shape[1], point_map_3d_np.shape[2]

    # Cast to float32 before leaving torch: NumPy has no bfloat16, which the
    # caller's autocast region may produce.
    intrinsics_np = intrinsic[0].float().cpu().numpy()
    extrinsics_np = extrinsic[0].float().cpu().numpy()

    # cameras.bin: camera_id, model_id, width, height, params
    pinhole_model_id = 1  # PINHOLE: fx, fy, cx, cy
    with open(sparse_dir / "cameras.bin", "wb") as f:
        f.write(struct.pack("<Q", num_cameras))
        for i in range(num_cameras):
            K = intrinsics_np[i]
            f.write(struct.pack(
                "<iiQQ4d",
                i + 1, pinhole_model_id, width, height,
                float(K[0, 0]), float(K[1, 1]), float(K[0, 2]), float(K[1, 2]),
            ))

    # images.bin: image_id, qvec, tvec, camera_id, name, num_points2D
    with open(sparse_dir / "images.bin", "wb") as f:
        f.write(struct.pack("<Q", num_cameras))
        for i in range(num_cameras):
            R = extrinsics_np[i, :3, :3]
            t = extrinsics_np[i, :3, 3]
            qw, qx, qy, qz = rotation_matrix_to_quaternion(R)
            f.write(struct.pack(
                "<i4d3di",
                i + 1,
                float(qw), float(qx), float(qy), float(qz),
                float(t[0]), float(t[1]), float(t[2]),
                i + 1,
            ))
            # COLMAP stores the name as a NUL-terminated string, with no
            # length prefix.
            f.write(Path(image_names[i]).name.encode('utf-8') + b"\x00")
            f.write(struct.pack("<Q", 0))  # no 2D observations

    # points3D.bin: point3D_id (uint64), xyz, rgb, error, track_length
    points = point_map_3d_np.reshape(-1, 3)
    # Drop NaN and infinite coordinates alike.
    valid_points = points[np.isfinite(points).all(axis=1)]

    # Packed record (51 bytes) laid out exactly like one COLMAP point entry,
    # so the whole table is written in a single call instead of per point.
    point_record = np.dtype([
        ("point3D_id", "<u8"),
        ("xyz", "<f8", (3,)),
        ("rgb", "u1", (3,)),
        ("error", "<f8"),
        ("track_length", "<u8"),
    ])
    table = np.zeros(len(valid_points), dtype=point_record)
    table["point3D_id"] = np.arange(1, len(valid_points) + 1, dtype=np.uint64)
    table["xyz"] = valid_points
    table["rgb"] = 128
    table["error"] = 1.0
    # track_length stays 0: no track elements follow.

    with open(sparse_dir / "points3D.bin", "wb") as f:
        f.write(struct.pack("<Q", len(valid_points)))
        f.write(table.tobytes())

    print(f"COLMAP形式で保存: {sparse_dir}")

def rotation_matrix_to_quaternion(R):
    """Convert a 3x3 rotation matrix to a quaternion ``(qw, qx, qy, qz)``.

    The branch is chosen from the trace and, when the trace is not positive,
    from the largest diagonal entry, so the value under the square root is
    the largest available one.

    Args:
        R: Array indexable as ``R[row, col]`` with 3 rows and 3 columns.

    Returns:
        Tuple ``(qw, qx, qy, qz)``.
    """
    r00, r11, r22 = R[0, 0], R[1, 1], R[2, 2]
    trace = np.trace(R)

    # Positive trace: the scalar part is the dominant component.
    if trace > 0:
        s = np.sqrt(trace + 1.0) * 2
        return (
            0.25 * s,
            (R[2, 1] - R[1, 2]) / s,
            (R[0, 2] - R[2, 0]) / s,
            (R[1, 0] - R[0, 1]) / s,
        )

    # x component dominant
    if r00 > r11 and r00 > r22:
        s = np.sqrt(1.0 + r00 - r11 - r22) * 2
        return (
            (R[2, 1] - R[1, 2]) / s,
            0.25 * s,
            (R[0, 1] + R[1, 0]) / s,
            (R[0, 2] + R[2, 0]) / s,
        )

    # y component dominant
    if r11 > r22:
        s = np.sqrt(1.0 + r11 - r00 - r22) * 2
        return (
            (R[0, 2] - R[2, 0]) / s,
            (R[0, 1] + R[1, 0]) / s,
            0.25 * s,
            (R[1, 2] + R[2, 1]) / s,
        )

    # z component dominant
    s = np.sqrt(1.0 + r22 - r00 - r11) * 2
    return (
        (R[1, 0] - R[0, 1]) / s,
        (R[0, 2] + R[2, 0]) / s,
        (R[1, 2] + R[2, 1]) / s,
        0.25 * s,
    )

def run_gaussian_splatting(output_dir):
    """Install gsplat with pip and print a training command for ``output_dir``.

    pip is run through the current interpreter; a failing install raises
    ``subprocess.CalledProcessError``.  The command is only printed, no
    training is started here.

    Args:
        output_dir: ``pathlib.Path`` whose absolute form is inserted into the
            printed command.
    """
    print("Gaussian Splattingをセットアップ中...")

    pip_install = [sys.executable, "-m", "pip", "install", "gsplat"]
    subprocess.run(pip_install, check=True)

    print(f"実行コマンド: python -m gsplat.examples.simple_trainer --data_dir {output_dir.absolute()}")

def apply_texture_to_mesh(mesh, images_np, extrinsic, intrinsic):
    """Colour every mesh vertex from the input view that sees it best.

    A view is a candidate for a vertex when the vertex lies in front of the
    camera and projects inside the image.  Candidates are scored by
    ``max(0, cos(angle between vertex normal and direction to the camera))
    * 1 / (1 + 0.1 * distance)`` and the pixel of the best-scoring view is
    taken; on ties the earlier view wins.  Vertices without a view of positive
    score keep mid-grey ``(0.5, 0.5, 0.5)``.

    Each view is processed for all vertices at once with NumPy rather than
    looping over vertices in Python.

    Args:
        mesh: Open3D triangle mesh; its vertex colours are overwritten in
            place.  Vertex normals are computed if the mesh has none.
        images_np: Array indexed as ``[view, channel, row, col]``; channels
            0, 1, 2 are copied unchanged as R, G, B (presumably already in
            the 0..1 range Open3D expects -- TODO confirm).
        extrinsic: Tensor indexed as ``[0, view, :3, :4]``, applied as
            ``x_cam = R @ x_world + t``.
        intrinsic: Tensor indexed as ``[0, view]`` giving a 3x3 matrix.

    Returns:
        The same ``mesh`` object.
    """
    print("高品質テクスチャマッピングを適用中...")

    # The scoring below indexes one normal per vertex.
    if not mesh.has_vertex_normals():
        mesh.compute_vertex_normals()

    vertices = np.asarray(mesh.vertices)
    vertex_normals = np.asarray(mesh.vertex_normals)
    num_vertices = len(vertices)

    vertex_colors = np.full((num_vertices, 3), 0.5)
    best_weight = np.zeros(num_vertices)

    num_views = images_np.shape[0]
    img_height, img_width = images_np.shape[2], images_np.shape[3]

    # Cast to float32 before leaving torch: NumPy has no bfloat16, which the
    # caller's autocast region may produce.
    extrinsics_np = extrinsic[0].float().cpu().numpy().astype(np.float64)
    intrinsics_np = intrinsic[0].float().cpu().numpy().astype(np.float64)

    for view_idx in range(num_views):
        R = extrinsics_np[view_idx, :3, :3]
        t = extrinsics_np[view_idx, :3, 3]
        K = intrinsics_np[view_idx]
        camera_pos = -R.T @ t

        # World -> camera for all vertices; keep those in front of the camera.
        vertex_cam = vertices @ R.T + t
        candidates = np.flatnonzero(vertex_cam[:, 2] > 0)
        if candidates.size == 0:
            continue
        cam = vertex_cam[candidates]

        # Floor rather than truncate, so projections in (-1, 0) are treated as
        # outside instead of being mapped to pixel 0.  The bounds test is done
        # on floats, before the integer cast, to avoid overflow.
        u_float = np.floor(K[0, 0] * cam[:, 0] / cam[:, 2] + K[0, 2])
        v_float = np.floor(K[1, 1] * cam[:, 1] / cam[:, 2] + K[1, 2])
        inside = (
            (u_float >= 0) & (u_float < img_width)
            & (v_float >= 0) & (v_float < img_height)
        )
        if not inside.any():
            continue
        candidates = candidates[inside]
        cam = cam[inside]
        u = u_float[inside].astype(np.int64)
        v = v_float[inside].astype(np.int64)

        # Unit direction camera -> vertex; the clamp avoids dividing by zero
        # for a vertex sitting exactly at the camera centre (it then scores 0).
        view_dir = vertices[candidates] - camera_pos
        view_dir /= np.maximum(np.linalg.norm(view_dir, axis=1, keepdims=True), 1e-12)

        facing = -np.einsum("ij,ij->i", view_dir, vertex_normals[candidates])
        angle_weight = np.maximum(0.0, facing)
        distance_weight = 1.0 / (1.0 + np.linalg.norm(cam, axis=1) * 0.1)
        weight = angle_weight * distance_weight

        # Strict ">" keeps the earlier view on ties.
        better = weight > best_weight[candidates]
        winners = candidates[better]
        best_weight[winners] = weight[better]
        # image[:3, v, u] has shape (3, n); transpose to one RGB row per vertex.
        vertex_colors[winners] = images_np[view_idx][:3, v[better], u[better]].T

    mesh.vertex_colors = o3d.utility.Vector3dVector(vertex_colors)
    return mesh

def create_mesh_from_pointcloud(point_map_3d_np, output_dir, images, extrinsic, intrinsic):
    """Build a vertex-coloured mesh from the point map and preview it.

    Steps: assemble a point cloud with oriented normals, run Poisson surface
    reconstruction (depth 9), colour the vertices with
    ``apply_texture_to_mesh``, write ``textured_mesh.ply`` into
    ``output_dir`` and open an Open3D viewer window (blocks until closed).

    ``estimate_normals`` alone leaves the sign of each normal arbitrary, while
    Poisson reconstruction and the colour weighting both rely on consistently
    oriented normals.  Each view's points are therefore processed separately
    and their normals flipped towards the camera that observed them.

    Args:
        point_map_3d_np: Array iterated along its first axis as one block of
            x, y, z points per view.
        output_dir: ``pathlib.Path`` of an existing output directory.
        images: Tensor of the input images, passed on for colouring.
        extrinsic: Tensor indexed as ``[0, view, :3, :4]``; the camera centre
            is taken as ``-R.T @ t``.
        intrinsic: Tensor of camera matrices, passed on for colouring.

    Returns:
        Path of the written mesh file.

    Raises:
        ValueError: If the point map contains no finite point.
    """
    print("Poisson表面再構築でメッシュを生成中...")

    # Cast to float32 before leaving torch: NumPy has no bfloat16, which the
    # caller's autocast region may produce.
    extrinsics_np = extrinsic[0].float().cpu().numpy().astype(np.float64)

    pcd = o3d.geometry.PointCloud()
    for view_idx, view_points in enumerate(point_map_3d_np):
        view_xyz = view_points.reshape(-1, 3)
        # Drop NaN and infinite coordinates alike.
        view_xyz = view_xyz[np.isfinite(view_xyz).all(axis=1)]
        if len(view_xyz) == 0:
            continue

        view_pcd = o3d.geometry.PointCloud()
        view_pcd.points = o3d.utility.Vector3dVector(view_xyz.astype(np.float64))
        view_pcd.estimate_normals()

        # Every point of this view was seen by this camera, so its outward
        # normal must face the camera centre.
        R = extrinsics_np[view_idx, :3, :3]
        t = extrinsics_np[view_idx, :3, 3]
        view_pcd.orient_normals_towards_camera_location(-R.T @ t)

        pcd += view_pcd

    num_valid_points = len(pcd.points)
    print(f"有効な点群数: {num_valid_points}")
    if num_valid_points == 0:
        raise ValueError("有効な3D点がないため、メッシュを生成できません")

    print("Poissonメッシュ生成中...")
    mesh, _ = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(pcd, depth=9)

    print("テクスチャマッピング適用中...")
    images_np = images.cpu().numpy()
    mesh = apply_texture_to_mesh(mesh, images_np, extrinsic, intrinsic)

    mesh_path = output_dir / "textured_mesh.ply"
    print(f"メッシュ保存中: {mesh_path}")
    o3d.io.write_triangle_mesh(str(mesh_path), mesh)
    print(f"テクスチャ付きメッシュ保存: {mesh_path}")

    print("3Dプレビューを表示中...")
    o3d.visualization.draw_geometries([mesh],
                                    window_name="テクスチャ付きメッシュプレビュー",
                                    width=1024, height=768)
    print("3Dプレビュー完了")

    return mesh_path

def main():
    """Run the interactive VGGT reconstruction demo.

    Flow: print usage notes, pick the device and autocast precision, obtain
    the image list (downloaded test set or a user folder), load the
    ``facebook/VGGT-1B`` model, predict cameras, depth, point maps and tracks,
    print the result shapes and, if the user agrees, save everything to
    ``vggt_output`` (tensors, point cloud, COLMAP files, coloured mesh) and
    print the Gaussian Splatting command.

    Returns early with a message when no input image is found.
    """
    print("VGGT 3D Scene Reconstruction Demo")
    print("=" * 40)
    print("操作方法:")
    print("  - テスト画像を使用するか、独自の画像フォルダを指定してください")
    print("  - 結果の保存は任意です")
    print("注意事項:")
    print("  - CUDA対応GPUが推奨されます")
    print("  - 初回実行時はモデルのダウンロードに時間がかかります")
    print("  - 2枚の画像でも高品質な深度推定が可能です")
    print()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"使用デバイス: {device}")

    # bfloat16 needs compute capability 8.0 or newer; otherwise use float16.
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        dtype = torch.bfloat16
        print("精度: bfloat16")
    else:
        dtype = torch.float16
        print("精度: float16")

    use_test_images = input("テスト画像をダウンロードしますか？ (y/n): ").strip().lower() == 'y'

    if use_test_images:
        image_names = download_test_images()
    else:
        image_folder = input("画像フォルダのパスを入力（空白でデフォルト'images'）: ").strip()
        if not image_folder:
            image_folder = "images"

        image_path = Path(image_folder)
        # Compare lower-cased suffixes instead of globbing "*.jpg" and "*.JPG"
        # separately: on case-insensitive filesystems both patterns match the
        # same files, which would feed every image to the model twice.
        image_extensions = {'.jpg', '.jpeg', '.png', '.bmp'}
        image_files = []
        if image_path.is_dir():
            image_files = [
                entry for entry in image_path.iterdir()
                if entry.is_file() and entry.suffix.lower() in image_extensions
            ]

        image_names = [str(f) for f in sorted(image_files)]

    # Stop here rather than failing later inside image loading.
    if not image_names:
        print("画像が見つかりませんでした。画像フォルダのパスを確認してください。")
        return

    print(f"処理対象: {len(image_names)}枚の画像")
    for name in image_names:
        print(f"  - {name}")

    print("VGGTモデルを読み込み中...")
    model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
    # Inference only: make sure no training-mode layer behaviour is active.
    model.eval()
    print("モデル読み込み完了")

    print("画像の読み込みと前処理中...")
    images = load_and_preprocess_images(image_names).to(device)

    print("3D再構築実行中...")
    with torch.no_grad():
        with torch.cuda.amp.autocast(dtype=dtype):
            # Add the batch dimension expected by the model heads.
            images_batch = images[None]

            aggregated_tokens_list, ps_idx = model.aggregator(images_batch)

            # Camera parameters from the last entry returned by the camera head
            pose_enc = model.camera_head(aggregated_tokens_list)[-1]
            extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

            depth_map, _ = model.depth_head(aggregated_tokens_list, images_batch, ps_idx)

            point_map, _ = model.point_head(aggregated_tokens_list, images_batch, ps_idx)

            # 3D points obtained by unprojecting depth with the predicted cameras
            point_map_3d = unproject_depth_map_to_point_map(
                depth_map.squeeze(0),
                extrinsic.squeeze(0),
                intrinsic.squeeze(0)
            )

            # Two fixed example query pixels for the tracking head
            query_points = torch.FloatTensor([[100.0, 200.0], [60.72, 259.94]]).to(device)
            track_list, _, _ = model.track_head(
                aggregated_tokens_list, images_batch, ps_idx, query_points=query_points[None]
            )

    print("\n再構築完了")
    print(f"カメラ外部パラメータ形状: {extrinsic.shape}")
    print(f"カメラ内部パラメータ形状: {intrinsic.shape}")
    print(f"深度マップ形状: {depth_map.shape}")
    print(f"ポイントマップ形状: {point_map.shape}")
    print(f"3D点群形状: {point_map_3d.shape}")
    print(f"追跡点数: {len(track_list)}個のトラック")

    save_results = input("結果をファイルに保存しますか？ (y/n): ").strip().lower() == 'y'
    if save_results:
        output_dir = Path("vggt_output")
        output_dir.mkdir(exist_ok=True)

        print("結果保存中...")
        torch.save(extrinsic.cpu(), output_dir / "extrinsic.pt")
        torch.save(intrinsic.cpu(), output_dir / "intrinsic.pt")
        torch.save(depth_map.cpu(), output_dir / "depth_maps.pt")

        print("点群データをnumpy配列に変換中...")
        # Accept either a tensor or an array from the unprojection helper.
        if isinstance(point_map_3d, torch.Tensor):
            point_map_3d_np = point_map_3d.cpu().numpy()
        else:
            point_map_3d_np = point_map_3d
        np.save(output_dir / "point_cloud_3d.npy", point_map_3d_np)

        print("COLMAP形式保存中...")
        save_colmap_format(extrinsic, intrinsic, point_map_3d_np, image_names, output_dir)

        print("メッシュ生成開始...")
        mesh_path = create_mesh_from_pointcloud(point_map_3d_np, output_dir, images, extrinsic, intrinsic)

        print("Gaussian Splattingセットアップ...")
        run_gaussian_splatting(output_dir)

        print(f"結果保存先: {output_dir}")
        print(f"生成テクスチャ付きメッシュ: {mesh_path}")
        print("3D再構築パイプライン完了")

    print("\nVGGTデモ実行完了")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()