Emergent Protocol

Downloads:

24-bit FLAC (690 MB) · VBR MP3 (78 MB) · Ogg Vorbis (37 MB) · C++ source code (4 kB)

A feedback process iteratively amplifies audio recognised from a control signal. In this version the control signal was a one-minute excerpt from the BBC Radio 4 drama serial "The Archers", and the source signal was 3 seconds of pink noise. Inspired by Deep Dream, but no neural networks were involved this time.

To be more precise, for each overlapping window of source audio, the program finds the nearest matching window of control audio, where the metric is the squared distance between normalised energy-per-octave vectors. A windowed combination of the input window and its match is then overlap-added onto the end of the source audio. The process continues over the rolling buffer, eventually feeding back on itself (the previous output becomes the new input).
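
As a rough sketch of just the matching step (the names and types here are illustrative only; the complete listing below is what actually ran): each window is reduced to a vector of normalised energies, one per octave band, and the control window at the smallest squared distance from the query is selected.

/*
  Illustrative sketch of the matching metric, not part of the piece.
*/
#include <array>
#include <cstddef>
#include <limits>
#include <vector>

typedef std::array<float, 11> octaves; // one normalised energy per octave band

std::size_t nearest(const std::vector<octaves> &control, const octaves &query)
{
  std::size_t best = 0;
  float best_distance = std::numeric_limits<float>::infinity();
  for (std::size_t i = 0; i < control.size(); ++i)
  {
    // squared Euclidean distance between energy vectors
    float d2 = 0;
    for (std::size_t o = 0; o < query.size(); ++o)
    {
      float d = control[i][o] - query[o];
      d2 += d * d;
    }
    if (d2 < best_distance)
    {
      best_distance = d2;
      best = i;
    }
  }
  return best;
}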

To be absolutely precise, here's the C++ source code using libsndfile and libfftw3:

/*
  Emergent Protocol (c) 2017 Claude Heiland-Allen <claude@mathr.co.uk>
  g++ -std=c++11 -Wall -Wextra -pedantic -O3 -o ep ep.cpp -lfftw3f -lsndfile
  ./ep control.wav source.wav output.wav
*/

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <complex>
#include <vector>
#include <sndfile.h>
#include <fftw3.h>

#define CHANNELS 2                    // stereo
#define OCTAVES 11                    // octave bands per channel in the matching feature
#define FFTSIZE (1 << (OCTAVES + 2))  // analysis window length in samples (8192)
#define OVERLAP 4                     // successive windows overlap by 3/4
#define SR 44100                      // sample rate in Hz
#define LENGTH (SR * 60 * 60)         // one hour of output

struct energy
{
  float e[CHANNELS][OCTAVES]; // normalised energy per octave band
  float a[FFTSIZE][CHANNELS]; // the interleaved audio frames of the window
};

float *window;

float *fft_in;
std::complex<float> *fft_out;
fftwf_plan plan;

// Analyse the control file: compute the normalised per-octave energy of each
// overlapping window, keeping the raw audio of each window for resynthesis.
std::vector<energy> analyse(const char *filename)
{
  std::vector<energy> result;

  SF_INFO info = { 0, 0, 0, 0, 0, 0 };
  SNDFILE *sndfile = sf_open(filename, SFM_READ, &info);
  assert(info.channels == CHANNELS);

  energy e;
  while (FFTSIZE == sf_readf_float(sndfile, &e.a[0][0], FFTSIZE))
  {
    float sum = 0;
    for (int c = 0; c < CHANNELS; ++c)
    {
      for (int s = 0; s < FFTSIZE; ++s)
      {
        fft_in[s] = window[s] * e.a[s][c];
      }
      fftwf_execute(plan);
      // accumulate spectral energy into octave bands: band o spans FFT bins [2^o, 2^(o+1))
      for (int i = 1, o = 0; o < OCTAVES; ++o, i <<= 1)
      {
        e.e[c][o] = 0;
        for (int s = i; s < i << 1; ++s)
        {
          sum += e.e[c][o] += std::norm(fft_out[s]);
        }
      }
    }
    // normalise the feature so matching is independent of overall level
    for (int c = 0; c < CHANNELS; ++c)
    {
      for (int o = 0; o < OCTAVES; ++o)
      {
        e.e[c][o] /= sum;
      }
    }
    result.push_back(e);
    // step back so successive windows overlap by (OVERLAP - 1) / OVERLAP
    sf_seek(sndfile, -(OVERLAP - 1) * FFTSIZE / OVERLAP, SEEK_CUR);
  }

  sf_close(sndfile);

  return result;
}

float audio[LENGTH][CHANNELS]; // rolling buffer: source audio followed by generated output

// Generate output: repeatedly analyse the window at the read position, find
// its nearest control window, and overlap-add the result at the write
// position; the read position eventually enters previously generated audio,
// closing the feedback loop.
void generate(const std::vector<energy> &analysis, const char *infilename, const char *outfilename)
{
  memset(&audio[0][0], 0, LENGTH * CHANNELS * sizeof(audio[0][0]));

  SF_INFO srcinfo = { 0, 0, 0, 0, 0, 0 };
  SNDFILE *srcsndfile = sf_open(infilename, SFM_READ, &srcinfo);
  assert(srcinfo.channels == CHANNELS);
  sf_count_t r = 0;                                                // read position in frames
  sf_count_t w = sf_readf_float(srcsndfile, &audio[r][0], LENGTH); // write position starts at the end of the source
  sf_close(srcsndfile);

  while (w + FFTSIZE < LENGTH)
  {
    energy e;
    float sum = 0;
    for (int c = 0; c < CHANNELS; ++c)
    {
      for (int s = 0; s < FFTSIZE; ++s)
      {
        fft_in[s] = window[s] * audio[r + s][c];
      }
      fftwf_execute(plan);
      // the same octave-band energy feature as in analyse()
      for (int i = 1, o = 0; o < OCTAVES; ++o, i <<= 1)
      {
        e.e[c][o] = 0;
        for (int s = i; s < i << 1; ++s)
        {
          sum += e.e[c][o] += std::norm(fft_out[s]);
        }
      }
    }
    for (int c = 0; c < CHANNELS; ++c)
    {
      for (int o = 0; o < OCTAVES; ++o)
      {
        e.e[c][o] /= sum;
      }
    }
    // linear search for the control window nearest in octave-energy space
    auto target = analysis.begin();
    float m = 1.0f/0.0f; // +infinity
    for (auto i = analysis.begin(); i != analysis.end(); ++i)
    {
      float s = 0;
      for (int c = 0; c < CHANNELS; ++c)
      {
        for (int o = 0; o < OCTAVES; ++o)
        {
          float d = (*i).e[c][o] - e.e[c][o];
          s += d * d;
        }
      }
      if (s < m)
      {
        m = s;
        target = i;
      }
    }
    // overlap-add a windowed waveshaping of the difference between the
    // input window and its matched control window
    for (int c = 0; c < CHANNELS; ++c)
    {
      for (int s = 0; s < FFTSIZE; ++s)
      {
        audio[w + s][c] += window[s] * std::sin(audio[r + s][c] - (*target).a[s][c]);
      }
    }

    r += FFTSIZE / OVERLAP; // advance read and write positions by the hop size
    w += FFTSIZE / OVERLAP;
  }

  SF_INFO dstinfo = { 0, SR, CHANNELS, SF_FORMAT_WAV | SF_FORMAT_FLOAT, 0, 0 };
  SNDFILE *dstsndfile = sf_open(outfilename, SFM_WRITE, &dstinfo);
  sf_writef_float(dstsndfile, &audio[0][0], LENGTH);
  sf_close(dstsndfile); // flush and finalise the WAV header
}

int main(int argc, char **argv)
{
  if (! (argc > 3))
    return 0;
  window = (float *)fftwf_malloc(FFTSIZE * sizeof(*window));
  float g = 0.25;
  // raised-cosine (Hann) window scaled by g
  for (int s = 0; s < FFTSIZE; ++s)
  {
    window[s] = g * (1 - std::cos(2 * 3.141592653589793 * s / FFTSIZE));
  }
  fft_in = (float *)fftwf_malloc(FFTSIZE * sizeof(*fft_in));
  fft_out = (std::complex<float> *)fftwf_malloc(FFTSIZE * sizeof(*fft_out));
  plan = fftwf_plan_dft_r2c_1d(FFTSIZE, fft_in, (float (*)[2])fft_out, FFTW_PATIENT | FFTW_PRESERVE_INPUT);
  generate(analyse(argv[1]), argv[2], argv[3]);
  fftwf_destroy_plan(plan);
  fftwf_free(fft_out);
  fftwf_free(fft_in);
  fftwf_free(window);
  return 0;
}

Eventually I want to learn enough about machine learning to re-implement this idea using neural networks, perhaps trained on a variety of classical instrument sounds, to create an orchestral noise symphony.