
Commit 2650f8f

Smoother motion BPM
1 parent e1f2349 commit 2650f8f

File tree

10 files changed: +78 -80 lines

bin/annotate_beats.py

+3 -9

@@ -8,7 +8,7 @@
 import librosa
 import sys
 import argparse
-from headbang.util import load_wav
+from headbang.util import load_wav, overlay_clicks


 class Player:
@@ -111,15 +111,9 @@ def main():

     player.terminate_processing()

-    x = load_wav(args.wav_in, stereo=True)
-
     print("annotated beat locations: {0}".format(beats))
-    beat_clicks = librosa.clicks(beats, sr=44100, length=len(x))
-
-    if x.shape[0] == 1:
-        beat_clicks = numpy.column_stack((beat_clicks, beat_clicks))
-
-    beat_waveform = (x + beat_clicks).astype(numpy.single)
+    x = load_wav(args.wav_in, stereo=True)
+    beat_waveform = overlay_clicks(x, beats)

     print("Writing outputs with clicks to {0}".format(args.beat_wav_out))
     write_wave_file(beat_waveform, args.beat_wav_out, sample_rate=44100)

bin/beat_track.py

+24 -25

@@ -10,7 +10,7 @@
 from madmom.io.audio import write_wave_file

 from headbang import HeadbangBeatTracker
-from headbang.util import load_wav
+from headbang.util import load_wav, overlay_clicks
 from headbang.params import DEFAULTS


@@ -26,27 +26,27 @@ def main():
         "--algorithms",
         type=str,
         default=DEFAULTS["algorithms"],
-        help="List of beat tracking algorithms to apply",
+        help="List of beat tracking algorithms to apply (default=%(default)s)",
     )
     beat_args.add_argument(
         "--onset-align-threshold-s",
         type=float,
         default=DEFAULTS["onset_align_threshold_s"],
-        help="How close beats should align with onsets (in seconds)",
+        help="How close beats should align with onsets (in seconds) (default=%(default)s)",
     )

     onset_args = parser.add_argument_group("onsets arguments")
     onset_args.add_argument(
         "--max-no-beats",
         type=float,
         default=DEFAULTS["max_no_beats"],
-        help="Segments with missing beats to substitute onsets",
+        help="Segments with missing beats to substitute onsets (default=%(default)s)",
     )
     onset_args.add_argument(
         "--onset-near-threshold-s",
         type=float,
         default=DEFAULTS["onset_near_threshold_s"],
-        help="How close onsets should be (in seconds) when supplementing onset information",
+        help="How close onsets should be (in seconds) when supplementing onset information (default=%(default)s)",
     )
     onset_args.add_argument(
         "--onset-silence-threshold",
@@ -59,7 +59,7 @@ def main():
         "--n-pool",
         type=int,
         default=multiprocessing.cpu_count() - 1,
-        help="How many threads to use in multiprocessing pool",
+        help="How many threads to use in multiprocessing pool (default=%(default)s)",
     )
     parser.add_argument(
         "--show-plots",
@@ -77,62 +77,68 @@ def main():
         help="disable transient shaping, only use percussive separation",
     )
     parser.add_argument(
-        "--beats-out", type=str, default="", help="output beats txt file"
+        "--beats-out",
+        type=str,
+        default="",
+        help="output beats txt file (default=%(default)s)",
     )

     hpss_args = parser.add_argument_group("hpss arguments")
     hpss_args.add_argument(
         "--harmonic-margin",
         type=float,
         default=DEFAULTS["harmonic_margin"],
-        help="Separation margin for HPSS harmonic iteration",
+        help="Separation margin for HPSS harmonic iteration (default=%(default)s)",
     )
     hpss_args.add_argument(
         "--harmonic-frame",
         type=int,
         default=DEFAULTS["harmonic_frame"],
-        help="T-F/frame size for HPSS harmonic iteration",
+        help="T-F/frame size for HPSS harmonic iteration (default=%(default)s)",
     )
     hpss_args.add_argument(
         "--percussive-margin",
         type=float,
         default=DEFAULTS["percussive_margin"],
-        help="Separation margin for HPSS percussive iteration",
+        help="Separation margin for HPSS percussive iteration (default=%(default)s)",
     )
     hpss_args.add_argument(
         "--percussive-frame",
         type=int,
         default=DEFAULTS["percussive_frame"],
-        help="T-F/frame size for HPSS percussive iteration",
+        help="T-F/frame size for HPSS percussive iteration (default=%(default)s)",
     )

     tshaper_args = parser.add_argument_group("multiband transient shaper arguments")
     tshaper_args.add_argument(
         "--fast-attack-ms",
         type=int,
         default=DEFAULTS["fast_attack_ms"],
-        help="Fast attack (ms)",
+        help="Fast attack (ms) (default=%(default)s)",
     )
     tshaper_args.add_argument(
         "--slow-attack-ms",
         type=int,
         default=DEFAULTS["slow_attack_ms"],
-        help="Slow attack (ms)",
+        help="Slow attack (ms) (default=%(default)s)",
     )
     tshaper_args.add_argument(
-        "--release-ms", type=int, default=DEFAULTS["release_ms"], help="Release (ms)"
+        "--release-ms",
+        type=int,
+        default=DEFAULTS["release_ms"],
+        help="Release (ms) (default=%(default)s)",
     )
     tshaper_args.add_argument(
         "--power-memory-ms",
         type=int,
         default=DEFAULTS["power_memory_ms"],
-        help="Power filter memory (ms)",
+        help="Power filter memory (ms) (default=%(default)s)",
    )
     tshaper_args.add_argument(
         "--filter-order",
         type=int,
         default=DEFAULTS["filter_order"],
-        help="Bandpass (butter) filter order",
+        help="Bandpass (butter) filter order (default=%(default)s)",
     )

     parser.add_argument("wav_in", help="input wav file")
@@ -180,18 +186,11 @@ def main():
             f.write(f"{b}\n")

     print("Overlaying clicks at beat locations")
-    clicks = librosa.clicks(beats, sr=44100, length=len(x))
-
-    # if stereo, write it that way for higher quality
     x_stereo = load_wav(args.wav_in, stereo=True)
-
-    if len(x_stereo.shape) > 1 and x_stereo.shape[1] == 2:
-        clicks = numpy.column_stack((clicks, clicks))  # convert to stereo
-
-    final_waveform = (x_stereo + clicks).astype(numpy.single)
+    x_with_clicks = overlay_clicks(x_stereo, beats)

     print("Writing output with clicks to {0}".format(args.wav_out))
-    write_wave_file(final_waveform, args.wav_out, sample_rate=44100)
+    write_wave_file(x_with_clicks, args.wav_out, sample_rate=44100)

     if args.show_plots:
         print("Displaying plots")

bin/reference_beats.py

+10 -7

@@ -6,7 +6,7 @@
 import numpy
 import librosa
 from madmom.io.audio import load_audio_file, write_wave_file
-from headbang.util import load_wav
+from headbang.util import load_wav, overlay_clicks
 from headbang.beattrack import apply_single_beat_tracker, algo_names
 import madmom

@@ -19,10 +19,16 @@ def main():
     )

     parser.add_argument(
-        "--algorithm", type=int, default=1, help="which single algorithm to use"
+        "--algorithm",
+        type=int,
+        default=1,
+        help="which single algorithm to use (default=%(default)s)",
     )
     parser.add_argument(
-        "--filter-order", type=int, default=2, help="butter filter order"
+        "--filter-order",
+        type=int,
+        default=2,
+        help="butter filter order (default=%(default)s)",
     )
     parser.add_argument("wav_in", help="input wav file")
     parser.add_argument("beat_wav_out", help="output beat wav file")
@@ -36,10 +42,7 @@ def main():
     beat_times = apply_single_beat_tracker(x, args.algorithm)

     print("Overlaying clicks at beat locations")
-
-    beat_clicks = librosa.clicks(beat_times, sr=44100, length=len(x))
-
-    beat_waveform = (x + beat_clicks).astype(numpy.single)
+    beat_waveform = overlay_clicks(x, beat_times)

     print("Writing outputs with clicks to {0}".format(args.beat_wav_out))
     write_wave_file(beat_waveform, args.beat_wav_out, sample_rate=44100)

docs/index.md

+3 -13

@@ -95,7 +95,7 @@ The HeadbangBeatTracker first gathers beats using the ConsensusBeatTracker, and

 ![percussive_hpss](percussive_hpss.png)

-The algorithm used is one based on median filtering the spectrogram, originally described in [[9]](#9), and further improved with iterative algorithm in [[10]](#10). I use the [librosa implementation](https://librosa.org/doc/0.8.0/generated/librosa.decompose.hpss.html), which has a good visualization of the effect.
+The algorithm used is one based on median filtering the spectrogram, originally described in [[9]](#9), and further improved with an iterative algorithm in [[10]](#10). I use the [librosa implementation](https://librosa.org/doc/0.8.0/generated/librosa.decompose.hpss.html), which has a good visualization of the effect.

 The parameters of the HPSS can be modified (but I don't find it changes the results significantly):
 * `harmonic_frame=16384` (a larger frame size in the first iteration gives us higher frequency resolution which helps separate pitched, harmonic components)
@@ -197,11 +197,6 @@ Here's a table of some interesting outputs of headbang's algorithms:
 <td>{% include embed-audio.html src="themixture_dbn.wav" %}</td>
 <td>{% include embed-audio.html src="themixture_hbt.wav" %}</td>
 </tr>
-<tr>
-<td><a href="https://www.youtube.com/watch?v=Pru_5HW9Ofg">Vitalism - Luxata</a></td>
-<td>{% include embed-audio.html src="luxata_dbn.wav" %}</td>
-<td>{% include embed-audio.html src="luxata_hbt.wav" %}</td>
-</tr>
 <tr>
 <td><a href="https://www.youtube.com/watch?v=8niG0ta4jZs">Anup Sastry - Origin</a></td>
 <td>{% include embed-audio.html src="origin_dbn.wav" %}</td>
@@ -394,11 +389,6 @@ for bpm in bpms:

 Note the off-beat head bops at first, which transition to being on-beat.

-[Keith Ape - It G Ma](https://www.youtube.com/watch?v=Ls9QJEE0Drw):
-{% include embed-video.html src="it_g_ma_short.mp4" %}
-
-Although all the talk is about metal so far, it works well on the above rap song.
-
 Here's a clip from drummer [Anup Sastry - Titan](https://www.youtube.com/watch?v=Y82rls0yoAM), where the motion is tracked on the left and right arms (instead of the head and neck):
 {% include embed-video.html src="anupsastry_short.mp4" %}

@@ -408,10 +398,10 @@ Here's a clip from drummer [Anup Sastry - Titan](https://www.youtube.com/watch?v

 Note the two-step process:
 1. First, the video is stepped through frame by frame to apply OpenPose pose detection
-2. The resultant frames (with drawn keypoints) are written to a tmp mp4 file
+2. The resultant frames (with drawn keypoints) are written to a temporary mp4 file
 3. y coordinates are accumulated per-frame to track motion throughout the video and pick peaks
 4. Beats are computed from the audio
-5. The tmp mp4 file is loaded frame by frame, bop/beat/tempo values are drawn on the respective frames, and the result is written to the final output file
+5. The temporary mp4 file is loaded frame by frame, bop/beat/tempo values are drawn on the respective frames, and the result is written to the final output file

 The two-pass design was chosen out of necessity; keeping all of the frames of the video in-memory while performing all of the processing was leading to huge memory usage (32+GB) for long videos.
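
The numbered list in the diff above describes a frame-callback pipeline. Here is a minimal sketch of that two-pass layout, assuming moviepy as the video library (the process_second_pass(get_frame_fn, frame_time) signature elsewhere in this commit matches moviepy's clip.fl callback); the function bodies and file names are placeholders, not the project's actual code:

import tempfile

from moviepy.editor import VideoFileClip


def first_pass(get_frame, t):
    frame = get_frame(t).copy()
    # (placeholder) run OpenPose on this frame and draw keypoints on it
    return frame


def second_pass(get_frame, t):
    frame = get_frame(t).copy()
    # (placeholder) draw bop/beat/tempo values for time t on the frame
    return frame


tmp_mp4 = tempfile.mkstemp(suffix=".mp4")[1]

# pass 1: pose detection; annotated frames go straight to a temporary
# file instead of accumulating in memory
VideoFileClip("in.mp4").fl(first_pass).write_videofile(tmp_mp4)

# pass 2: reload the annotated frames and overlay the computed values
VideoFileClip(tmp_mp4).fl(second_pass).write_videofile("out.mp4")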

docs/luxata_dbn.wav

-5.05 MB
Binary file not shown.

docs/luxata_hbt.wav

-10.1 MB
Binary file not shown.

headbang-hud/headbang-hud.py

+17 -2

@@ -104,12 +104,24 @@ def bpm_from_beats(beats):
     return 60 / beat_step


+def bops_realistic_smoothing(bops, min_spacing):
+    return bops[numpy.where(numpy.diff(bops) > min_spacing)[0]]
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Track human pose in videos with music alongside groove metrics and beat tracking"
     )
     parser.add_argument(
-        "--custom-keypoints", type=str, help="Override the default face/neck keypoints"
+        "--custom-keypoints",
+        type=str,
+        help="Override the default face/neck keypoints (default=%(default)s)",
+    )
+    parser.add_argument(
+        "--minimum-bop-spacing",
+        type=float,
+        default=0.2,
+        help="Minimum spacing (in seconds) between bops to filter out implausible events (default=%(default)s)",
     )
     parser.add_argument("mp4_in", type=str, help="mp4 file to process")
     parser.add_argument("mp4_out", type=str, help="mp4 output path")
@@ -225,7 +237,7 @@ def process_first_pass(*args, **kwargs):

     print("Marking beat and head bop positions on output frames")

-    frame_history = 3  # consider this many seconds of history for bpm computation
+    frame_history = 5  # consider this many seconds of history for bpm computation

     all_beats_bpm = 0
     strong_beats_bpm = 0
@@ -255,10 +267,13 @@ def process_second_pass(get_frame_fn, frame_time):
         )
     ]

+    # keep a running history of bops
     bop_history = bop_locations[
         numpy.where((bop_locations >= frame_min) & (bop_locations <= frame_max))
     ]

+    bop_history = bops_realistic_smoothing(bop_history, args.minimum_bop_spacing)
+
     all_beats_bpm_tmp = bpm_from_beats(all_beat_history)
     bop_bpm_tmp = bpm_from_beats(bop_history)
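
To see what the new filter does in isolation: numpy.diff gives the gap following each bop, so the indexing keeps every bop that is followed by a gap wider than min_spacing (the final bop, having no following gap, is always dropped). A quick sketch with made-up timestamps:

import numpy


def bops_realistic_smoothing(bops, min_spacing):
    return bops[numpy.where(numpy.diff(bops) > min_spacing)[0]]


# two detections only 50 ms apart around t=1s: physically implausible
# for a head bop, so the pair should collapse to a single event
bops = numpy.array([1.00, 1.05, 1.50, 2.00])
print(bops_realistic_smoothing(bops, 0.2))  # [1.05 1.5 ]

Together with the widened BPM window in the same file (frame_history going from 3 to 5 seconds of beat history per estimate), this gap filter appears to be what the commit title's "Smoother motion BPM" refers to.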

headbang/util.py

+10

@@ -1,5 +1,6 @@
 from madmom.io.audio import load_audio_file
 import numpy
+import librosa


 def load_wav(wav_in, stereo=False):
@@ -17,3 +18,12 @@ def load_wav(wav_in, stereo=False):
     x /= numpy.max(numpy.abs(x))

     return x
+
+
+def overlay_clicks(x, beats):
+    clicks = librosa.clicks(beats, sr=44100, length=len(x))
+
+    if len(x.shape) > 1 and x.shape[1] == 2:
+        clicks = numpy.column_stack((clicks, clicks))  # convert to stereo
+
+    return (x + clicks).astype(numpy.single)
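
This helper consolidates the click-overlay code previously duplicated across the three scripts above. It assumes 44.1 kHz audio, with x either mono of shape (n,) or stereo of shape (n, 2) as returned by load_wav; in the stereo case the mono click track from librosa.clicks is duplicated into both channels so the addition broadcasts. A hypothetical round trip (file names are placeholders):

from madmom.io.audio import write_wave_file

from headbang.util import load_wav, overlay_clicks

x = load_wav("song.wav", stereo=True)  # (n, 2) float array
beats = [0.5, 1.0, 1.5, 2.0]  # beat times in seconds
write_wave_file(overlay_clicks(x, beats), "song_clicks.wav", sample_rate=44100)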

misc/latex-deliverables/citations.bib

+1 -1

@@ -1,4 +1,4 @@
-@website{mirex06,
+@misc{mirex06,
 title={2006:Audio Beat Tracking},
 url={https://www.music-ir.org/mirex/wiki/2006:Audio_Beat_Tracking},
 journal={MIREX Wiki}}
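
The entry-type change likely matters for compilation: @website is not one of biblatex's standard entry types, while @misc is universally supported, so the rename presumably keeps citations.bib loadable under the biblatex-chicago setup adopted in proposal.tex below.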

misc/latex-deliverables/proposal.tex

+10 -23

@@ -1,5 +1,5 @@
 \documentclass[letter,12pt]{report}
-\setlength{\parindent}{0pt}
+%\setlength{\parindent}{0pt}
 \usepackage[left=2cm, right=2cm, top=2cm, bottom=2cm]{geometry}
 \usepackage[shortlabels]{enumitem}
 \usepackage{graphicx}
@@ -24,12 +24,7 @@
 \renewcommand{\topfraction}{0.85}
 \renewcommand{\textfraction}{0.1}
 \renewcommand{\floatpagefraction}{0.75}
-\usepackage[
-%backend=biber,
-natbib=true,
-style=numeric,
-sorting=none
-]{biblatex}
+\usepackage[backend=biber,authordate]{biblatex-chicago}
 \addbibresource{citations.bib}
 \usepackage{titlesec}

@@ -38,31 +33,23 @@

 \begin{document}

-\Large{\textbf{headbang.py}}\\
-\large{Final Project Proposal. MUMT 621, April 06, 2021}\\
+\noindent\Large{\textbf{headbang.py}}\\
+\large{Final project proposal. MUMT 621, April 06, 2021}\\
 \large{Sevag Hanssian, 260398537}

-\hrulefill
+\noindent\hrulefill

 \vspace{2em}

-Beat tracking is a rich field of music information retrieval (MIR). The audio beat tracking task has been a part of MIREX since 2006 \cite{mirex06}, and receives submissions every year. Most recently, state of the art results have been achieved by \cite{bock1} and \cite{bock2}, who have also released their algorithms in the madmom Python library \cite{madmom}.
+Beat tracking is a rich field of music information retrieval (MIR). The audio beat tracking task has been a part of MIREX since 2006 (\cite{mirex06}), and receives submissions every year. Most recently, \textcite{bock1, bock2} have achieved state of the art results, and have released their algorithms in the open-source madmom Python library (\cite{madmom}).

-\vspace{1em}
+The beat tracking algorithms in MIREX are evaluated against diverse and challenging beat tracking datasets (\cite{beatmeta}). However, in my personal experiments on my preferred genres of music (mostly rhythmically-complex progressive metal, e.g., \cite{meshuggah, periphery}), I noticed that in several cases the beat locations output by the best algorithms were not correct.

-\qquad The beat tracking algorithms in MIREX are evaluated against diverse and challenging beat tracking datasets (\cite{beatmeta}). However, in my personal experiments on my preferred genres of music (mostly rhythmically-complex progressive metal, e.g., \cite{meshuggah}, \cite{periphery}), I noticed that in several cases the beat locations output by the best algorithms were not correct.
+For the first goal of my final project, I propose to explore various beat tracking algorithms and pre-processing techniques to demonstrate improved beat results in progressive metal songs. The name of the project is ``headbang.py''; the ``.py'' suffix is because it will be a code project written in Python, and ``headbang'' refers to the act of headbanging, where metal musicians or fans violently move their head up and down to the beat of a metal song.

-\vspace{1em}
+There are recent papers which combine MIR tasks with 2D pose estimation to associate human dance motion with musical beats (\cite{pose1}, \cite{pose2}). For the second goal of headbang.py, I propose to analyze headbanging motion in metal videos with the OpenPose 2D human pose estimation library. The results of the headbanging motion analysis can be displayed alongside the results of beat tracking, to potentially reveal some information about what drives the urge to headbang.

-\qquad For the first goal of my final project, I propose to explore various beat tracking algorithms and pre-processing techniques to demonstrate improved beat results in progressive metal songs. The name of the project is ``headbang.py''; the ``.py'' suffix is because it will be a code project written in Python, and ``headbang'' refers to the act of headbanging, where metal musicians or fans violently move their head up and down to the beat of a metal song.
-
-\vspace{1em}
-
-\qquad There are recent papers which combine MIR tasks with 2D pose estimation to associate human dance motion with musical beats (\cite{pose1}, \cite{pose2}). For the second goal of headbang.py, I propose to analyze headbanging motion in metal videos with the OpenPose 2D human pose estimation library. The results of the headbanging motion analysis can be displayed alongside the results of beat tracking, to potentially reveal some information about what drives the urge to headbang.
-
-\vspace{1em}
-
-\qquad One method for evaluating beat tracking results is overlaying clicks, or ``sonification'' of the beat annotations (\cite{clicks}), on the original track. This helps a person to verify that the clicks line up with their own perception of beat locations in listening tests. For an optional third goal of headbang.py (if time permits), I want to create a digital animation of a humanoid figure (2D or 3D) which headbangs on beat locations, as an alternative method of visualizing the outputs of beat trackers.
+One method for evaluating beat tracking results is overlaying clicks on predicted beats over the original track, to sonify the beat annotations (\cite{clicks}). This helps a person to verify that the clicks line up with their own perception of beat locations in listening tests. For an optional third goal of headbang.py (if time permits), I want to create a digital animation of a humanoid figure (2D or 3D) which headbangs on beat locations, as an alternative method of visualizing the outputs of beat trackers.

 \vfill
 \clearpage
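
The \cite/\textcite reshuffle above follows from the switch to biblatex-chicago's authordate style, where (as I understand the style) \cite prints a bare "Author Year", hence the manually written parentheses, and \textcite prints "Author (Year)" for use as the subject of a sentence. A minimal compilable sketch, assuming citations.bib provides these keys:

\documentclass[letter,12pt]{report}
\usepackage[backend=biber,authordate]{biblatex-chicago}
\addbibresource{citations.bib}
\begin{document}
% parenthetical use: \cite prints "Author Year" without parentheses,
% so the surrounding parentheses are written manually
The task has been part of MIREX since 2006 (\cite{mirex06}).
% narrative use: \textcite prints "Author (Year)" inline
\textcite{bock1, bock2} have achieved state of the art results.
\printbibliography
\end{document}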
