Commit
save x y groups as npz
Beici Liang committed Oct 19, 2018
1 parent dcad0b9 commit 8bf45ed
Showing 1 changed file with 201 additions and 1 deletion.
4. test on real audio data.ipynb (202 changes: 201 additions & 1 deletion)
@@ -47,6 +47,7 @@
"from keras.layers.merge import Concatenate\n",
"from keras import backend as K\n",
"from keras.backend.tensorflow_backend import set_session\n",
"from keras.layers import concatenate as concat\n",
"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\" # the number of the GPU\n",
"config = tf.ConfigProto()\n",
@@ -59,7 +60,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -799,6 +800,205 @@
"# plt.savefig(\"testfig.png\", bbox_inches=\"tight\", dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use transfer learning"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"__________________________________________________________________________________________________\n",
"Layer (type) Output Shape Param # Connected to \n",
"==================================================================================================\n",
"input_2 (InputLayer) (None, 1, 88200) 0 \n",
"__________________________________________________________________________________________________\n",
"melspectrogram_2 (Melspectrogra (None, 128, 200, 1) 1116288 input_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_2 (BatchNor (None, 128, 200, 1) 4 melspectrogram_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_1 (Conv2D) (None, 128, 200, 7) 427 batch_normalization_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_2 (Conv2D) (None, 128, 200, 7) 70 batch_normalization_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_3 (Conv2D) (None, 128, 200, 7) 427 batch_normalization_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"concatenate_1 (Concatenate) (None, 128, 200, 21) 0 conv2d_1[0][0] \n",
" conv2d_2[0][0] \n",
" conv2d_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_3 (BatchNor (None, 128, 200, 21) 84 concatenate_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation_1 (Activation) (None, 128, 200, 21) 0 batch_normalization_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"max_pooling2d_1 (MaxPooling2D) (None, 64, 100, 21) 0 activation_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"dropout_1 (Dropout) (None, 64, 100, 21) 0 max_pooling2d_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_4 (Conv2D) (None, 64, 100, 21) 3990 dropout_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_4 (BatchNor (None, 64, 100, 21) 84 conv2d_4[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation_2 (Activation) (None, 64, 100, 21) 0 batch_normalization_4[0][0] \n",
"__________________________________________________________________________________________________\n",
"max_pooling2d_2 (MaxPooling2D) (None, 32, 50, 21) 0 activation_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"dropout_2 (Dropout) (None, 32, 50, 21) 0 max_pooling2d_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_5 (Conv2D) (None, 32, 50, 21) 3990 dropout_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_5 (BatchNor (None, 32, 50, 21) 84 conv2d_5[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation_3 (Activation) (None, 32, 50, 21) 0 batch_normalization_5[0][0] \n",
"__________________________________________________________________________________________________\n",
"max_pooling2d_3 (MaxPooling2D) (None, 16, 25, 21) 0 activation_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"dropout_3 (Dropout) (None, 16, 25, 21) 0 max_pooling2d_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_6 (Conv2D) (None, 16, 25, 21) 3990 dropout_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_6 (BatchNor (None, 16, 25, 21) 84 conv2d_6[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation_4 (Activation) (None, 16, 25, 21) 0 batch_normalization_6[0][0] \n",
"__________________________________________________________________________________________________\n",
"max_pooling2d_4 (MaxPooling2D) (None, 4, 7, 21) 0 activation_4[0][0] \n",
"__________________________________________________________________________________________________\n",
"dropout_4 (Dropout) (None, 4, 7, 21) 0 max_pooling2d_4[0][0] \n",
"__________________________________________________________________________________________________\n",
"global_average_pooling2d_1 (Glo (None, 21) 0 dropout_4[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense_1 (Dense) (None, 2) 44 global_average_pooling2d_1[0][0] \n",
"==================================================================================================\n",
"Total params: 1,129,566\n",
"Trainable params: 13,108\n",
"Non-trainable params: 1,116,458\n",
"__________________________________________________________________________________________________\n"
]
}
],
"source": [
"npz_dir = os.path.join(DIR_REAL_DATA, 'reference') \n",
"dataset_name = 'pedal-times_realaudio.npz'\n",
"npz_path = os.path.join(npz_dir, dataset_name)\n",
"\n",
"tracks = np.load(npz_path)\n",
"filenames = tracks['filename']\n",
"pedal_offset_gt_tracks = tracks['pedal_offset']\n",
"pedal_onset_gt_tracks = tracks['pedal_onset']\n",
"\n",
"# get model\n",
"batch_size = 1\n",
"reg_w = 1e-4\n",
"model_name = 'multi_kernel'\n",
"segment_exp_name = 'segment_{}'.format(model_name)\n",
"segment_model = model_multi_kernel_shape(n_out=2,input_shape=SEGMENT_INPUT_SHAPE)\n",
"segment_model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])\n",
"# load weights\n",
"segment_model.load_weights(os.path.join(DIR_SAVE_MODEL,\"{}_best_weights.h5\".format(segment_exp_name)))\n",
"# get model summary\n",
"segment_model.summary()"
]
},
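A sanity check one could run at this point (my addition, not part of this commit) is to confirm where the 1,116,458 non-trainable parameters reported in the summary live; it should show that almost all of them sit in the frozen Melspectrogram front-end, with only the convolutional and dense layers contributing trainable weights.

# Hedged sketch, not in the original notebook; uses only objects defined above (segment_model, np, K).
for layer in segment_model.layers:
    n_train = int(np.sum([K.count_params(w) for w in layer.trainable_weights]))
    n_fixed = int(np.sum([K.count_params(w) for w in layer.non_trainable_weights]))
    print('{:32s} trainable={:>8d} frozen={:>9d}'.format(layer.name, n_train, n_fixed))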
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ChopinOp66...\n",
"ChopinOp28No20...\n",
"ChopinOp10No3...\n",
"ChopinB49...\n",
"ChopinOp28No15...\n",
"ChopinOp28No4...\n",
"ChopinOp28No7...\n",
"ChopinOp23No1...\n",
"ChopinOp28No6...\n",
"ChopinOp69No2...\n"
]
}
],
"source": [
"# use as feature extractor\n",
"feat_layer1 = GlobalAveragePooling2D()(segment_model.get_layer('activation_1').output)\n",
"feat_layer2 = GlobalAveragePooling2D()(segment_model.get_layer('activation_2').output)\n",
"feat_layer3 = GlobalAveragePooling2D()(segment_model.get_layer('activation_3').output)\n",
"feat_layer4 = GlobalAveragePooling2D()(segment_model.get_layer('activation_4').output)\n",
"segment_feat_all = concat([feat_layer1,feat_layer2,feat_layer3,feat_layer4])\n",
"segment_feat_extractor = Model(inputs = segment_model.input, outputs = segment_feat_all)\n",
"\n",
"for filename_idx, filename in enumerate(filenames): \n",
" pedal_offset_gt = np.array(pedal_offset_gt_tracks[filename_idx])\n",
" pedal_onset_gt = np.array(pedal_onset_gt_tracks[filename_idx])\n",
"\n",
" paudio_dir = os.path.join(DIR_REAL_DATA, '{}'.format(filename)) \n",
" paudio_path = os.path.join(paudio_dir, '{}.wav'.format(filename))\n",
"\n",
" paudio, sr = librosa.load(paudio_path, sr=SR) \n",
" print(\"{}...\".format(filename))\n",
" len_segment_shape = int(SR * MIN_SRC)\n",
" seghop_length = HOP_LENGTH*10\n",
" seghop_duration = seghop_length/SR\n",
" n_psegment = int(np.ceil((len(paudio)-len_segment_shape)/seghop_length))\n",
" gen_psegment = data_gen(paudio, n_psegment, len_segment_shape, 'segment', hop_length=seghop_length)\n",
" segment_feat = segment_feat_extractor.predict_generator(gen_psegment, n_psegment// batch_size)\n",
"\n",
" # set the ground truth frame by frame\n",
" paudio_duration = librosa.get_duration(y=paudio, sr=SR)\n",
" n_frames = int(np.ceil(paudio_duration/seghop_duration))\n",
" segframes_gt = np.zeros(n_frames)\n",
" segframes_est = np.zeros(n_frames)\n",
"\n",
" pedal_offset_gt = np.array(tracks['pedal_offset'][filename_idx])\n",
" pedal_onset_gt = np.array(tracks['pedal_onset'][filename_idx])\n",
" longpseg_idx = np.where((pedal_offset_gt-pedal_onset_gt)>seghop_duration)[0]\n",
" longseg_onset_gt = pedal_onset_gt[longpseg_idx]\n",
" longseg_offset_gt = pedal_offset_gt[longpseg_idx]\n",
" segintervals_gt = np.stack((longseg_onset_gt,longseg_offset_gt), axis=-1)\n",
"\n",
" for idx, onset_t in enumerate(longseg_onset_gt):\n",
" offset_t = longseg_offset_gt[idx]\n",
" onset_frm = int(onset_t//seghop_duration)\n",
" offset_frm = int(offset_t//seghop_duration)\n",
" segframes_gt[onset_frm:offset_frm] = 1 \n",
" \n",
" # align the segframes_gt to the features from transfer learning\n",
" segframes_gt_transfer = segframes_gt[1:1+segment_feat.shape[0]]\n",
" group = np.array([filename] * segment_feat.shape[0])\n",
" \n",
" # concat\n",
" if filename_idx == 0:\n",
" segment_feats = np.copy(segment_feat)\n",
" segframes_gt_transfers = np.copy(segframes_gt_transfer)\n",
" groups = np.copy(group)\n",
" else:\n",
" segment_feats = np.concatenate((segment_feats, segment_feat),axis=0)\n",
" segframes_gt_transfers = np.concatenate((segframes_gt_transfers, segframes_gt_transfer),axis=0)\n",
" groups = np.concatenate((groups, group),axis=0)"
]
},
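Since each of the four pooled activation maps has 21 channels, the extractor returns an 84-dimensional vector per segment. A minimal shape check (sketch only, not part of the commit; it assumes SEGMENT_INPUT_SHAPE is the (1, 88200) shape implied by the model summary above):

# Hypothetical silent segment, used only to probe the output shape of the feature extractor.
dummy_segment = np.zeros((batch_size,) + tuple(SEGMENT_INPUT_SHAPE))
dummy_feat = segment_feat_extractor.predict(dummy_segment)
print(dummy_feat.shape)  # expected (1, 84): 4 pooled activation layers x 21 channels each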
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"save_dir = os.path.join(DIR_REAL_DATA, 'reference') \n",
"np.savez(os.path.join(save_dir, 'transfer-learning-xyg_segment.npz'), \n",
" X=segment_feats, y=segframes_gt_transfers, groups=groups)"
]
},
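The saved arrays can later be read back with plain numpy, e.g. before a group-aware train/test split over tracks (sketch, not part of the commit):

# Reload the features (X), frame-level pedal labels (y) and per-track group labels saved above.
data = np.load(os.path.join(save_dir, 'transfer-learning-xyg_segment.npz'))
X, y, groups = data['X'], data['y'], data['groups']
print(X.shape, y.shape, len(np.unique(groups)))  # frames x features, one label per frame, number of tracks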
{
"cell_type": "code",
"execution_count": null,