Hi.
I'm really excited about this project. I've been working on it for a few days. Well, I say I've been working on it — it's mostly been Gemini. It uses YOLO to create a virtual piano. I hope most people reading this have seen the film BIG with Tom Hanks.
The great thing is it doesn't require much hardware. My laptop has 8 gig of memory and it runs fairly well.
It uses YOLO to detect ankle position and if it's in a box then it plays a note.
I have experimented with a few other instruments including guitar, drums, harp and theremin but I think the piano works the best and is the most fun.
For the time being I'll have to draw the piano on the floor using either tape or chalk. Eventually I hope to get a projector.
I haven't found anyone on YouTube doing anything like this, but there must be somebody. It seems like such an obvious use of YOLO.
I switched my laptop over to Ubuntu just for this. I've been meaning to do it for a few years.
Hopefully everything you need to try this for yourself is below. Have fun!
I'll try to get some screenshots and video up in the next few days.
Frugal
The audio files:
https://github.com/RyanHuynh/PianoSamples
Prerequisites:
sudo apt update && sudo apt upgrade -y
sudo apt install python3-pip python3-venv libgl1-mesa-glx libpulse0 -y
Set up and activate a virtual environment if you need to. I did:
python3 -m venv piano_env
source piano_env/bin/activate
pip install ultralytics opencv-python pygame
The code:
import cv2
import numpy as np
import pygame
import threading
from ultralytics import YOLO
import os
import json
# --- 1. SETUP & CONFIG ---
SAMPLE_DIR = "/home/simon/PianoSamples-master/Sample 2/wav"  # root of the piano .wav samples
CONFIG_FILE = "piano_config.txt"  # persisted keyboard-corner coordinates (JSON)

print("\n--- 🎹 CHROMATIC PIANO ---")

# Ask for the keyboard size; a non-numeric answer falls back to the default,
# and the value is clamped to the advertised 1-3 range so later key-geometry
# math can't produce a degenerate keyboard.
try:
    octaves = int(input("How many octaves? (1, 2, or 3): ") or 2)
except ValueError:
    octaves = 2
octaves = max(1, min(3, octaves))

# Only the first letter matters, so "front"/"f" both select the mirrored
# front-camera layout; empty input defaults to 'f'.
camera_pos = (input("Camera position? ([F]ront or [B]ehind): ").lower() or 'f')[:1]
# Toe-offset mode pushes the detected ankle point toward the toes (see main loop).
toe_mode = input("Enable Toe-Offset mode? (y/n): ").lower() == 'y'
# --- 2. NOTE & GEOMETRY LOGIC ---
# The twelve semitone names of one octave, starting at C.
CHROMATIC_PATTERN = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

# Build the full key list, e.g. ["C2", "C#2", ..., "B3"] for two octaves
# starting at octave 2.
note_sequence = [
    f"{note}{octave}"
    for octave in range(2, 2 + octaves)
    for note in CHROMATIC_PATTERN
]

# A front-facing camera mirrors the scene, so reverse the key order to keep
# low notes on the player's left.
if camera_pos == 'f':
    note_sequence = note_sequence[::-1]
def load_pts():
    """Return the four keyboard-corner points as a (4, 2) float32 array.

    Reads the corners saved in CONFIG_FILE when present; falls back to a
    default trapezoid when the file is missing or unreadable. Corner order is
    top-left, top-right, bottom-right, bottom-left.
    """
    if os.path.exists(CONFIG_FILE):
        try:
            with open(CONFIG_FILE, 'r') as f:
                return np.array(json.load(f), dtype=np.float32)
        except (OSError, ValueError):
            # Unreadable or corrupt config (json.JSONDecodeError is a
            # ValueError): fall through to the default layout instead of
            # swallowing every exception with a bare `except`.
            pass
    return np.array([[150, 150], [450, 150], [550, 400], [50, 400]], np.float32)
def save_pts(current_pts):
    """Persist the four keyboard corners to CONFIG_FILE as a JSON list."""
    serializable = current_pts.tolist()
    with open(CONFIG_FILE, 'w') as handle:
        json.dump(serializable, handle)
pts = load_pts()  # current keyboard-corner points; mutated live by mouse drags
selected_point = -1  # index (0-3) of the corner being dragged, -1 when none
headless = False  # when True, draw the overlay on black instead of the camera frame
# --- 3. AUDIO ---
# pre_init must run before pygame.init(): 44.1 kHz, signed 16-bit, stereo,
# with a small 1024-sample buffer for low playback latency.
pygame.mixer.pre_init(44100, -16, 2, 1024)
pygame.init()
# 32 mixer channels so rapid/overlapping notes don't cut each other off.
pygame.mixer.set_num_channels(32)
def load_note(name):
    """Load the sample for note `name` (e.g. "C#3"); None if the file is absent.

    Sharps are stored with an 's' in the filename, so "C#3" maps to "Cs3.wav"
    inside SAMPLE_DIR.
    """
    sample_path = os.path.join(SAMPLE_DIR, name.replace("#", "s") + ".wav")
    if not os.path.exists(sample_path):
        return None
    return pygame.mixer.Sound(sample_path)
# One pygame Sound per entry in note_sequence (None where a sample is missing).
piano_keys = [load_note(n) for n in note_sequence]
def play_async(sound):
    """Start `sound` on a daemon thread so the video loop never blocks.

    A falsy `sound` (e.g. None for a missing sample) is silently ignored.
    """
    if not sound:
        return
    threading.Thread(target=sound.play, daemon=True).start()
# --- 4. POLYGON CALCULATION ---
def get_key_poly(idx, current_pts):
    """Return the screen-space quadrilateral (int32, shape (4, 2)) for key `idx`.

    `current_pts` holds the four dragged keyboard corners in order top-left,
    top-right, bottom-right, bottom-left (see load_pts). Key corners are found
    by linear interpolation along the top and bottom edges, so the keyboard may
    be an arbitrary perspective-distorted trapezoid.
    """
    total_notes = len(note_sequence)
    is_sharp = "#" in note_sequence[idx]
    white_slots = octaves * 7  # 7 white keys per octave define the horizontal grid
    # Horizontal start (in white-key widths) of each semitone within one octave;
    # sharps start 0.6 keys in and are drawn 0.6 keys wide.
    # NOTE(review): A# uses 5.7 rather than 5.6 — presumably a deliberate visual
    # tweak, but confirm it isn't a typo.
    slot_map = [0, 0.6, 1, 1.6, 2, 3, 3.6, 4, 4.6, 5, 5.7, 6]
    # note_sequence is reversed for a front camera, so recover the note's
    # logical (musical) index before computing its slot position.
    l_idx = idx if camera_pos == 'b' else (total_notes - 1 - idx)
    pos = (l_idx // 12 * 7) + slot_map[l_idx % 12]
    # Mirror the slot position horizontally for a front-facing camera.
    if camera_pos == 'f': pos = white_slots - pos - (0.6 if is_sharp else 1.0)
    # Normalised horizontal start/end (0..1) of the key along the top/bottom edges.
    w_s, w_e = pos / white_slots, (pos + (0.6 if is_sharp else 1.0)) / white_slots
    # Sharps extend only 45% of the way down the keyboard; whites span the full depth.
    v_e = 0.45 if is_sharp else 1.0
    # Interpolate along the top edge (corners 0 -> 1) and bottom edge (3 -> 2).
    tL, tR = (1-w_s)*current_pts[0] + w_s*current_pts[1], (1-w_e)*current_pts[0] + w_e*current_pts[1]
    bL, bR = (1-w_s)*current_pts[3] + w_s*current_pts[2], (1-w_e)*current_pts[3] + w_e*current_pts[2]
    # Corners ordered clockwise: top-left, top-right, bottom-right, bottom-left.
    return np.array([tL, tR, (1-v_e)*tR + v_e*bR, (1-v_e)*tL + v_e*bL], np.int32)
# --- 5. MAIN LOOP ---
model = YOLO('yolo11n-pose.pt')  # lightweight pose model; its keypoints include the ankles
cap = cv2.VideoCapture(0)  # default webcam
cv2.namedWindow("Leonessa Piano")
def mouse_callback(event, x, y, flags, param):
    """Let the user drag the four keyboard corners with the left mouse button.

    Press grabs the nearest corner within 25 px, move drags it, and release
    persists the new layout via save_pts. Disabled while running headless.
    """
    global pts, selected_point
    if headless:
        return
    if event == cv2.EVENT_LBUTTONDOWN:
        for corner in range(4):
            if np.linalg.norm(pts[corner] - [x, y]) < 25:
                selected_point = corner
                break
    elif event == cv2.EVENT_MOUSEMOVE and selected_point != -1:
        pts[selected_point] = [x, y]
    elif event == cv2.EVENT_LBUTTONUP:
        if selected_point != -1:
            save_pts(pts)
            selected_point = -1

cv2.setMouseCallback("Leonessa Piano", mouse_callback)
current_zones = [None, None]  # key index last triggered by each foot (left, right)
try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break
        frame = cv2.flip(frame, 1)  # mirror so on-screen motion matches the player
        # Headless mode draws the overlay on black instead of the camera image.
        display_frame = np.zeros_like(frame) if headless else frame.copy()
        results = model.predict(frame, imgsz=192, verbose=False)
        key_polys = [get_key_poly(i, pts) for i in range(len(note_sequence))]

        # Draw the keyboard: filled dark quads for sharps, outlines for every key.
        for i, poly in enumerate(key_polys):
            is_sharp = "#" in note_sequence[i]
            color = (255, 0, 255) if is_sharp else (220, 220, 220)
            if is_sharp:
                cv2.fillPoly(display_frame, [poly], (40, 40, 40))
            # BUG FIX: outline the key polygon itself. The original passed
            # [display_frame] as the point list, so key outlines never drew.
            cv2.polylines(display_frame, [poly], True, color, 1)

        if not headless:
            # Yellow drag handles on the four keyboard corners.
            for p in pts:
                cv2.circle(display_frame, tuple(p.astype(int)), 8, (0, 255, 255), -1)

        if results[0].keypoints is not None and len(results[0].keypoints.data) > 0:
            kpts, conf = results[0].keypoints.xy[0], results[0].keypoints.conf[0]
            active_this_frame = [None, None]
            # COCO pose keypoints 15 and 16 are the left and right ankles.
            for f_idx, tid in enumerate([15, 16]):
                if conf[tid] > 0.4:
                    ax, ay = kpts[tid][0].item(), kpts[tid][1].item()
                    # TOE OFFSET CALCULATION
                    tx, ty = ax, ay
                    if toe_mode:
                        # Push the detection point 15% further away from the
                        # keyboard's top edge to approximate the toe position
                        # (the ankle sits behind the toes from the camera).
                        top_mid = (pts[0] + pts[1]) / 2
                        ty = ay + (ay - top_mid[1]) * 0.15
                    # Sharps overlap the whites, so test the sharp keys first.
                    found_idx = None
                    for i in range(len(note_sequence)):
                        if "#" in note_sequence[i] and cv2.pointPolygonTest(key_polys[i], (tx, ty), False) >= 0:
                            found_idx = i
                            break
                    if found_idx is None:
                        for i in range(len(note_sequence)):
                            if "#" not in note_sequence[i] and cv2.pointPolygonTest(key_polys[i], (tx, ty), False) >= 0:
                                found_idx = i
                                break
                    active_this_frame[f_idx] = found_idx
                    if found_idx is not None:
                        cv2.circle(display_frame, (int(tx), int(ty)), 10, (0, 255, 0), -1)
            # Trigger a note only when a foot ENTERS a key; stepping off records
            # None so re-entering the same key retriggers it.
            for i in range(2):
                if active_this_frame[i] is not None and active_this_frame[i] != current_zones[i]:
                    play_async(piano_keys[active_this_frame[i]])
                current_zones[i] = active_this_frame[i]

        cv2.imshow("Leonessa Piano", display_frame)
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):  # quit
            break
        elif key == ord('h'):  # toggle headless overlay mode
            headless = not headless
finally:
    # Always release the camera and GUI resources, even on error or Ctrl-C.
    cap.release()
    cv2.destroyAllWindows()