Hi.
I'm really excited about this project. I've been working on it for a few days. Well, I say I've been working on it — it's mostly been Gemini. It uses YOLO to create a virtual piano. I hope most people reading this have seen the film BIG with Tom Hanks.
The great thing is it doesn't require much hardware. My laptop has 8 gig of memory and it runs fairly well.
It uses YOLO to detect ankle position and if it's in a box then it plays a note.
I have experimented with a few other instruments including guitar, drums, harp and theremin but I think the piano works the best and is the most fun.
For the time being I'll have to draw the piano on the floor using either tape or chalk. Eventually I hope to get a projector.
I haven't found anyone on YouTube doing anything like this, but there must be somebody. It seems like such an obvious use of YOLO.
I switched my laptop over to Ubuntu just for this. I've been meaning to do it for a few years.
Hopefully everything you need to try this for yourself is below. Have fun!
I'll try to get some screenshots and video up in the next few days.
Frugal
The audio files:
https://github.com/RyanHuynh/PianoSamples
Prerequisites:
sudo apt update && sudo apt upgrade -y
sudo apt install python3-pip python3-venv libgl1-mesa-glx libpulse0 -y
Set up and activate a virtual environment if you need to. I did:
python3 -m venv piano_env
source piano_env/bin/activate
pip install ultralytics opencv-python pygame
The code:
import cv2
import numpy as np
import pygame
import threading
from ultralytics import YOLO
import os
import json
# --- 1. SETUP & CONFIG ---
SAMPLE_DIR = "/home/simon/PianoSamples-master/Sample 2/wav"  # root of the piano .wav samples
CONFIG_FILE = "piano_config.txt"  # persisted keyboard-corner coordinates (JSON)

print("\n--- 🎹 CHROMATIC PIANO ---")

# Ask for the keyboard size; a non-numeric answer falls back to the default,
# and the value is clamped to the advertised 1-3 range so later key-geometry
# math can't produce a degenerate keyboard.
try:
    octaves = int(input("How many octaves? (1, 2, or 3): ") or 2)
except ValueError:
    octaves = 2
octaves = max(1, min(3, octaves))

# Only the first letter matters, so "front"/"f" both select the mirrored
# front-camera layout; empty input defaults to 'f'.
camera_pos = (input("Camera position? ([F]ront or [B]ehind): ").lower() or 'f')[:1]
# Toe-offset mode pushes the detected ankle point toward the toes (see main loop).
toe_mode = input("Enable Toe-Offset mode? (y/n): ").lower() == 'y'
# --- 2. NOTE & GEOMETRY LOGIC ---
# The twelve semitone names of one octave, starting at C.
CHROMATIC_PATTERN = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

# Build the full key list, e.g. ["C2", "C#2", ..., "B3"] for two octaves
# starting at octave 2.
note_sequence = [
    f"{note}{octave}"
    for octave in range(2, 2 + octaves)
    for note in CHROMATIC_PATTERN
]

# A front-facing camera mirrors the scene, so reverse the key order to keep
# low notes on the player's left.
if camera_pos == 'f':
    note_sequence = note_sequence[::-1]
def load_pts():
    """Return the four keyboard-corner points as a (4, 2) float32 array.

    Reads the corners saved in CONFIG_FILE when present; falls back to a
    default trapezoid when the file is missing or unreadable. Corner order is
    top-left, top-right, bottom-right, bottom-left.
    """
    if os.path.exists(CONFIG_FILE):
        try:
            with open(CONFIG_FILE, 'r') as f:
                return np.array(json.load(f), dtype=np.float32)
        except (OSError, ValueError):
            # Unreadable or corrupt config (json.JSONDecodeError is a
            # ValueError): fall through to the default layout instead of
            # swallowing every exception with a bare `except`.
            pass
    return np.array([[150, 150], [450, 150], [550, 400], [50, 400]], np.float32)
def save_pts(current_pts):
    """Persist the four keyboard corners to CONFIG_FILE as a JSON list."""
    serializable = current_pts.tolist()
    with open(CONFIG_FILE, 'w') as handle:
        json.dump(serializable, handle)
pts = load_pts()  # current keyboard-corner points; mutated live by mouse drags
selected_point = -1  # index (0-3) of the corner being dragged, -1 when none
headless = False  # when True, draw the overlay on black instead of the camera frame
# --- 3. AUDIO ---
# pre_init must run before pygame.init(): 44.1 kHz, signed 16-bit, stereo,
# with a small 1024-sample buffer for low playback latency.
pygame.mixer.pre_init(44100, -16, 2, 1024)
pygame.init()
# 32 mixer channels so rapid/overlapping notes don't cut each other off.
pygame.mixer.set_num_channels(32)
def load_note(name):
    """Load the sample for note `name` (e.g. "C#3"); None if the file is absent.

    Sharps are stored with an 's' in the filename, so "C#3" maps to "Cs3.wav"
    inside SAMPLE_DIR.
    """
    sample_path = os.path.join(SAMPLE_DIR, name.replace("#", "s") + ".wav")
    if not os.path.exists(sample_path):
        return None
    return pygame.mixer.Sound(sample_path)
# One pygame Sound per entry in note_sequence (None where a sample is missing).
piano_keys = [load_note(n) for n in note_sequence]
def play_async(sound):
    """Start `sound` on a daemon thread so the video loop never blocks.

    A falsy `sound` (e.g. None for a missing sample) is silently ignored.
    """
    if not sound:
        return
    threading.Thread(target=sound.play, daemon=True).start()
# --- 4. POLYGON CALCULATION ---
def get_key_poly(idx, current_pts):
    """Return the screen-space quadrilateral (int32, shape (4, 2)) for key `idx`.

    `current_pts` holds the four dragged keyboard corners in order top-left,
    top-right, bottom-right, bottom-left (see load_pts). Key corners are found
    by linear interpolation along the top and bottom edges, so the keyboard may
    be an arbitrary perspective-distorted trapezoid.
    """
    total_notes = len(note_sequence)
    is_sharp = "#" in note_sequence[idx]
    white_slots = octaves * 7  # 7 white keys per octave define the horizontal grid
    # Horizontal start (in white-key widths) of each semitone within one octave;
    # sharps start 0.6 keys in and are drawn 0.6 keys wide.
    # NOTE(review): A# uses 5.7 rather than 5.6 — presumably a deliberate visual
    # tweak, but confirm it isn't a typo.
    slot_map = [0, 0.6, 1, 1.6, 2, 3, 3.6, 4, 4.6, 5, 5.7, 6]
    # note_sequence is reversed for a front camera, so recover the note's
    # logical (musical) index before computing its slot position.
    l_idx = idx if camera_pos == 'b' else (total_notes - 1 - idx)
    pos = (l_idx // 12 * 7) + slot_map[l_idx % 12]
    # Mirror the slot position horizontally for a front-facing camera.
    if camera_pos == 'f': pos = white_slots - pos - (0.6 if is_sharp else 1.0)
    # Normalised horizontal start/end (0..1) of the key along the top/bottom edges.
    w_s, w_e = pos / white_slots, (pos + (0.6 if is_sharp else 1.0)) / white_slots
    # Sharps extend only 45% of the way down the keyboard; whites span the full depth.
    v_e = 0.45 if is_sharp else 1.0
    # Interpolate along the top edge (corners 0 -> 1) and bottom edge (3 -> 2).
    tL, tR = (1-w_s)*current_pts[0] + w_s*current_pts[1], (1-w_e)*current_pts[0] + w_e*current_pts[1]
    bL, bR = (1-w_s)*current_pts[3] + w_s*current_pts[2], (1-w_e)*current_pts[3] + w_e*current_pts[2]
    # Corners ordered clockwise: top-left, top-right, bottom-right, bottom-left.
    return np.array([tL, tR, (1-v_e)*tR + v_e*bR, (1-v_e)*tL + v_e*bL], np.int32)
# --- 5. MAIN LOOP ---
model = YOLO('yolo11n-pose.pt')  # lightweight pose model; its keypoints include the ankles
cap = cv2.VideoCapture(0)  # default webcam
cv2.namedWindow("Leonessa Piano")
def mouse_callback(event, x, y, flags, param):
    """Let the user drag the four keyboard corners with the left mouse button.

    Press grabs the nearest corner within 25 px, move drags it, and release
    persists the new layout via save_pts. Disabled while running headless.
    """
    global pts, selected_point
    if headless:
        return
    if event == cv2.EVENT_LBUTTONDOWN:
        for corner in range(4):
            if np.linalg.norm(pts[corner] - [x, y]) < 25:
                selected_point = corner
                break
    elif event == cv2.EVENT_MOUSEMOVE and selected_point != -1:
        pts[selected_point] = [x, y]
    elif event == cv2.EVENT_LBUTTONUP:
        if selected_point != -1:
            save_pts(pts)
            selected_point = -1

cv2.setMouseCallback("Leonessa Piano", mouse_callback)
current_zones = [None, None]  # key index last triggered by each foot (left, right)
try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break
        frame = cv2.flip(frame, 1)  # mirror so on-screen motion matches the player
        # Headless mode draws the overlay on black instead of the camera image.
        display_frame = np.zeros_like(frame) if headless else frame.copy()
        results = model.predict(frame, imgsz=192, verbose=False)
        key_polys = [get_key_poly(i, pts) for i in range(len(note_sequence))]

        # Draw the keyboard: filled dark quads for sharps, outlines for every key.
        for i, poly in enumerate(key_polys):
            is_sharp = "#" in note_sequence[i]
            color = (255, 0, 255) if is_sharp else (220, 220, 220)
            if is_sharp:
                cv2.fillPoly(display_frame, [poly], (40, 40, 40))
            # BUG FIX: outline the key polygon itself. The original passed
            # [display_frame] as the point list, so key outlines never drew.
            cv2.polylines(display_frame, [poly], True, color, 1)

        if not headless:
            # Yellow drag handles on the four keyboard corners.
            for p in pts:
                cv2.circle(display_frame, tuple(p.astype(int)), 8, (0, 255, 255), -1)

        if results[0].keypoints is not None and len(results[0].keypoints.data) > 0:
            kpts, conf = results[0].keypoints.xy[0], results[0].keypoints.conf[0]
            active_this_frame = [None, None]
            # COCO pose keypoints 15 and 16 are the left and right ankles.
            for f_idx, tid in enumerate([15, 16]):
                if conf[tid] > 0.4:
                    ax, ay = kpts[tid][0].item(), kpts[tid][1].item()
                    # TOE OFFSET CALCULATION
                    tx, ty = ax, ay
                    if toe_mode:
                        # Push the detection point 15% further away from the
                        # keyboard's top edge to approximate the toe position
                        # (the ankle sits behind the toes from the camera).
                        top_mid = (pts[0] + pts[1]) / 2
                        ty = ay + (ay - top_mid[1]) * 0.15
                    # Sharps overlap the whites, so test the sharp keys first.
                    found_idx = None
                    for i in range(len(note_sequence)):
                        if "#" in note_sequence[i] and cv2.pointPolygonTest(key_polys[i], (tx, ty), False) >= 0:
                            found_idx = i
                            break
                    if found_idx is None:
                        for i in range(len(note_sequence)):
                            if "#" not in note_sequence[i] and cv2.pointPolygonTest(key_polys[i], (tx, ty), False) >= 0:
                                found_idx = i
                                break
                    active_this_frame[f_idx] = found_idx
                    if found_idx is not None:
                        cv2.circle(display_frame, (int(tx), int(ty)), 10, (0, 255, 0), -1)
            # Trigger a note only when a foot ENTERS a key; stepping off records
            # None so re-entering the same key retriggers it.
            for i in range(2):
                if active_this_frame[i] is not None and active_this_frame[i] != current_zones[i]:
                    play_async(piano_keys[active_this_frame[i]])
                current_zones[i] = active_this_frame[i]

        cv2.imshow("Leonessa Piano", display_frame)
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):  # quit
            break
        elif key == ord('h'):  # toggle headless overlay mode
            headless = not headless
finally:
    # Always release the camera and GUI resources, even on error or Ctrl-C.
    cap.release()
    cv2.destroyAllWindows()