2018-11-12-tsnegoldberg

i wanted to use tsne to create a piece that explored a variety of timbres from a single limited domain

i had some other ideas for this assignment that i did not end up pursuing

i ultimately settled on trying to do a piece that utilized small slices of glenn gould's goldberg variations because i thought i could get away with only doing the aria

i used soundflower to illegally get Goldberg Variations, BWV 988: Aria - Remastered, a song by Johann Sebastian Bach, Glenn Gould on Spotify into ableton where i could trim the onset and end of the recording

i then followed the ML4A tutorial (ml4a-guides/audio-tsne.ipynb on GitHub) to learn how to use librosa for feature extraction to feed sklearn's implementation of tsne

i cut up the aria into 1-second long clips

def cut_all():
    """Slice ./sf-goldberg.wav into consecutive 1-second clips, writing each
    complete clip to ./goldsecs/<i>.wav; returns the sample rate."""
    y, sr = librosa.load("./sf-goldberg.wav", mono=True, dtype="float32")
    for i, mini_sample in enumerate(mt.chunked(y, sr)):
        # keep only full-length chunks (exactly sr samples); drops the tail
        if len(mini_sample) == sr:
            librosa.output.write_wav(
                "./goldsecs/{}.wav".format(i), np.asarray(mini_sample), sr, True)
            print("saved {}.wav".format(i))
    return sr

then i used the same feature extraction method from the ML4A tutorial for using tsne with audio and used TSNE to reduce dimensionality to a single dimension

then i created a dataset of the schema

i = incrementing by time in the original recording
filename = where clip is on disk
feature_vec = numpy array of the same featurization from the tutorial
y = numpy array of the raw audio sample
pos = position on the axis of the single dimension i used TSNE to reduce the dataset to
def get_ds():
    """Build the dataset: every usable clip record from feats() plus a
    "pos" key giving its coordinate on a 1-D t-SNE projection of the
    feature vectors.

    Bug fix: positions were previously matched back to records via the
    enumeration index of the t-SNE output, but records are keyed by the
    (possibly non-contiguous) file index "i", and NaN vectors are filtered
    out before fitting -- so positions could attach to the wrong clip or
    raise KeyError. Kept records are now zipped directly with their own
    projected positions.
    """
    ds = list(feats())
    # drop records whose feature vector contains NaN; t-SNE can't fit them
    kept = [d for d in ds if keep_feature_vector(d["feature_vec"])]
    feature_vec_list = [d["feature_vec"] for d in kept]
    positions = TSNE(n_components=1, learning_rate=150, perplexity=30,
                     verbose=2, angle=0.1).fit_transform(feature_vec_list)
    # pair each kept record with its own 1-D coordinate
    return [{"pos": pos, **d} for d, pos in zip(kept, positions[:, 0])]

i then cut up the recording into 1-second long windows again and calculated the volume of each window

import more_itertools as mt
import numpy as np
import librosa

def get_vol(arr):
    # median frame-wise RMS energy: a single loudness number for one window
    return np.median(librosa.feature.rmse(arr))

# one volume estimate per 1-second chunk of the loaded track y
volumes = list(map(lambda c: get_vol(np.asarray(c)), mt.chunked(y, sr)))

i normalized this list of volumes so that 0 meant the window had the lowest volume in the recording and 1 meant the window's volume was amongst the highest. i then used this 0-1 number to select clips from the single axis i projected my corpus onto

def by_track_vol(y, sr, by_pos):
    """Yield one clip record per second of the track *y*, chosen by volume.

    Each 1-second window's relative volume (0 = quietest, 1 = loudest) is
    mapped to a rank along the t-SNE axis, so quiet windows pick clips from
    one end of the axis and loud windows from the other.

    Bug fix: the original computed `pos = int(n_samples * rel_vol)` and
    looked that integer up directly in *by_pos*, whose keys are continuous
    t-SNE coordinates -- the lookup virtually never hit, so almost every
    window fell through to a random clip. The 0-1 volume is now mapped to
    a rank in the sorted key list, which is what the scaling implies.
    """
    volumes = [get_vol(np.asarray(chunk)) for chunk in mt.chunked(y, sr)]
    rel_vols = normalized_arr(np.asarray(volumes))
    axis_order = sorted(by_pos)  # t-SNE coordinates, low to high
    n_samples = len(axis_order)
    for rel_vol in rel_vols:
        # clamp so rel_vol == 1.0 maps to the last clip, not past the end
        rank = min(int(n_samples * rel_vol), n_samples - 1)
        yield by_pos[axis_order[rank]]

the piece is simply a rendering of a concatenation of the samples using this sequencing method

def dump_linear(out_filename, sorted_ds, sr):
    """Concatenate the raw audio ("y") of each record in *sorted_ds* in
    order and write it to *out_filename* at sample rate *sr*.

    A single np.concatenate replaces the pairwise reduce, which re-copied
    the growing buffer on every step (quadratic time).
    """
    # axis=None in the old reduce flattened each piece; ravel preserves that
    pieces = [np.ravel(d["y"]) for d in sorted_ds]
    master_y = np.concatenate(pieces)
    librosa.output.write_wav(out_filename, master_y, sr, True)

ds = get_ds()
by_pos = {d["pos"]: d for d in ds}
y, sr = librosa.load("./sf-goldberg.wav", mono=True, dtype="float32")
sorted_by_track_vol = by_track_vol(y, sr, by_pos)
dump_linear("by_vol.wav", sorted_by_track_vol, sr)

i also rendered one that is simply sorted by the single axis of the TSNE dimensionality reduction

sorted_by_pos = sorted(ds, key=op.itemgetter("pos"))
# fix: render the position-sorted dataset -- the original passed
# sorted_by_track_vol, a generator already exhausted by the previous render
dump_linear("by_dim.wav", sorted_by_pos, sr)

ALL CODE

import itertools as it
import random
import more_itertools as mt
import functools as ft
import asyncio
import operator as op
import time
import subprocess
import fnmatch
import os

import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
from sklearn.manifold import TSNE
import json


def files(path="./goldsecs/"):
    """Yield the path of every .wav file under *path*, recursively.

    path: root directory to scan; defaults to the folder of 1-second
    Goldberg clips written by cut_all(). (The original had a dead
    first assignment of `path` left over from a previous project;
    it is now a parameter.)
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in fnmatch.filter(filenames, '*.wav'):
            yield os.path.join(root, filename)


def get_features(y, sr):
    """Return a normalized 39-dim feature vector for the first second of *y*.

    The vector is the per-coefficient mean of 13 MFCCs plus their first and
    second deltas, computed from a 128-band log-mel spectrogram, then
    z-scored (zero mean, unit std) across its 39 entries.
    """
    first_second = y[0:sr]
    mel = librosa.feature.melspectrogram(first_second, sr=sr, n_mels=128)
    mel_db = librosa.amplitude_to_db(mel, ref=np.max)
    mfcc = librosa.feature.mfcc(S=mel_db, n_mfcc=13)
    d1 = librosa.feature.delta(mfcc, mode='nearest')
    d2 = librosa.feature.delta(mfcc, order=2, mode='nearest')
    fv = np.concatenate([np.mean(m, 1) for m in (mfcc, d1, d2)])
    return (fv - np.mean(fv)) / np.std(fv)


def feats():
    """Yield one record per loadable clip found by files().

    Each record has:
      i           -- index of the file in files() enumeration order
                     (non-contiguous when clips are skipped)
      filename    -- path of the clip on disk
      feature_vec -- feature vector from get_features()
      y           -- raw audio samples as loaded by librosa
    Clips that are too short or that fail to load/featurize are skipped.
    """
    for i, f in enumerate(files()):
        try:
            y, sr = librosa.load(f)
            if len(y) < 2:
                # too short to featurize; skip rather than crash downstream
                continue
            feat = get_features(y, sr)
        except Exception as exc:
            # narrowed from a bare `except:` that silently swallowed
            # everything (including KeyboardInterrupt); report and move on
            print("skipping {}: {}".format(f, exc))
            continue
        yield {
            "i": i,
            "filename": f,
            "feature_vec": feat,
            "y": y
        }


def cut_all():
    """Slice ./sf-goldberg.wav into consecutive 1-second clips and write
    each full-length clip to ./goldsecs/<index>.wav.

    Returns the sample rate of the source recording.
    """
    samples, rate = librosa.load("./sf-goldberg.wav", mono=True, dtype="float32")
    for index, chunk in enumerate(mt.chunked(samples, rate)):
        if len(chunk) != rate:
            # drop the trailing partial second
            continue
        out_path = "./goldsecs/{}.wav".format(index)
        librosa.output.write_wav(out_path, np.asarray(chunk), rate, True)
        print("saved {}.wav".format(index))
    return rate


def keep_feature_vector(fv):
    """True iff *fv* contains no NaN entries (NaN vectors break t-SNE)."""
    return not np.isnan(fv).any()


def get_ds():
    """Build the dataset: every usable clip record from feats() plus a
    "pos" key giving its coordinate on a 1-D t-SNE projection of the
    feature vectors.

    Bug fix: positions were previously matched back to records via the
    enumeration index of the t-SNE output, but records are keyed by the
    (possibly non-contiguous) file index "i", and NaN vectors are filtered
    out before fitting -- so positions could attach to the wrong clip or
    raise KeyError. Kept records are now zipped directly with their own
    projected positions.
    """
    ds = list(feats())
    # drop records whose feature vector contains NaN; t-SNE can't fit them
    kept = [d for d in ds if keep_feature_vector(d["feature_vec"])]
    feature_vec_list = [d["feature_vec"] for d in kept]
    positions = TSNE(n_components=1, learning_rate=150, perplexity=30,
                     verbose=2, angle=0.1).fit_transform(feature_vec_list)
    # pair each kept record with its own 1-D coordinate
    return [{"pos": pos, **d} for d, pos in zip(kept, positions[:, 0])]


def dump_linear(out_filename, sorted_ds, sr):
    """Concatenate the raw audio ("y") of each record in *sorted_ds* in
    order and write it to *out_filename* at sample rate *sr*.

    A single np.concatenate replaces the pairwise reduce, which re-copied
    the growing buffer on every step (quadratic time in the number of
    clips).
    """
    # axis=None in the old reduce flattened each piece; ravel preserves that
    pieces = [np.ravel(d["y"]) for d in sorted_ds]
    master_y = np.concatenate(pieces)
    librosa.output.write_wav(out_filename, master_y, sr, True)


def get_vol(arr):
    """Median frame-wise RMS energy of *arr* -- a scalar loudness proxy."""
    rms_frames = librosa.feature.rmse(arr)
    return np.median(rms_frames)

def normalized_arr(arr):
    """Min-max normalize *arr* to [0, 1].

    The minimum maps to 0 and the maximum to 1. A constant array is
    returned as all zeros instead of dividing by zero, which produced
    NaNs in the original.
    """
    shifted = arr - np.min(arr)
    peak = np.max(shifted)
    if peak == 0:
        # every element equal: nothing to scale, avoid 0/0
        return np.zeros_like(arr, dtype=float)
    return shifted / peak


def by_track_vol(y, sr, by_pos):
    """Yield one clip record per second of the track *y*, chosen by volume.

    Each 1-second window's relative volume (0 = quietest, 1 = loudest) is
    mapped to a rank along the t-SNE axis, so quiet windows pick clips from
    one end of the axis and loud windows from the other.

    Bug fix: the original computed `pos = int(n_samples * rel_vol)` and
    looked that integer up directly in *by_pos*, whose keys are continuous
    t-SNE coordinates -- the lookup virtually never hit, so almost every
    window fell through to a random clip. The 0-1 volume is now mapped to
    a rank in the sorted key list, which is what the scaling implies.
    """
    volumes = [get_vol(np.asarray(chunk)) for chunk in mt.chunked(y, sr)]
    rel_vols = normalized_arr(np.asarray(volumes))
    axis_order = sorted(by_pos)  # t-SNE coordinates, low to high
    n_samples = len(axis_order)
    for rel_vol in rel_vols:
        # clamp so rel_vol == 1.0 maps to the last clip, not past the end
        rank = min(int(n_samples * rel_vol), n_samples - 1)
        yield by_pos[axis_order[rank]]


# Build the dataset, render the volume-driven sequencing, then render a
# version ordered along the t-SNE axis.
ds = get_ds()
by_pos = {d["pos"]: d for d in ds}
y, sr = librosa.load("./sf-goldberg.wav", mono=True, dtype="float32")
sorted_by_track_vol = by_track_vol(y, sr, by_pos)
dump_linear("by_vol.wav", sorted_by_track_vol, sr)

# Bug fix: "by_dim.wav" previously reused sorted_by_track_vol -- a generator
# already exhausted by the render above -- instead of the position-sorted
# dataset, so the second render got no audio at all.
sorted_by_pos = sorted(ds, key=op.itemgetter("pos"))
dump_linear("by_dim.wav", sorted_by_pos, sr)