PreciousMposa committed (verified) · commit 519d358 · 1 parent: 16f476f

Upload 107 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the remaining files.
.dockerignore ADDED
@@ -0,0 +1,27 @@
+ **/__pycache__
+ **/.venv
+ **/.classpath
+ **/.dockerignore
+ **/.env
+ **/.git
+ **/.gitignore
+ **/.project
+ **/.settings
+ **/.toolstarget
+ **/.vs
+ **/.vscode
+ **/*.*proj.user
+ **/*.dbmdl
+ **/*.jfm
+ **/bin
+ **/charts
+ **/docker-compose*
+ **/compose*
+ **/Dockerfile*
+ **/node_modules
+ **/npm-debug.log
+ **/obj
+ **/secrets.dev.yaml
+ **/values.dev.yaml
+ LICENSE
+ README.md
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ demucs.png filter=lfs diff=lfs merge=lfs -text
+ test.mp3 filter=lfs diff=lfs merge=lfs -text
.github/ISSUE_TEMPLATE/bug.md ADDED
@@ -0,0 +1,33 @@
+ ---
+ name: 🐛 Bug Report
+ about: Submit a bug report to help us improve
+ labels: 'bug'
+ ---
+
+ ## 🐛 Bug Report
+
+ (A clear and concise description of what the bug is)
+
+ ## To Reproduce
+
+ (Write your steps here:)
+
+ 1. Step 1...
+ 1. Step 2...
+ 1. Step 3...
+
+ ## Expected behavior
+
+ (Write what you thought would happen.)
+
+ ## Actual Behavior
+
+ (Write what happened. Add screenshots, if applicable.)
+
+ ## Your Environment
+
+ <!-- Include as many relevant details about the environment you experienced the bug in -->
+
+ - Python and PyTorch version:
+ - Operating system and version (desktop or mobile):
+ - Hardware (gpu or cpu, amount of RAM etc.):
.github/ISSUE_TEMPLATE/question.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ name: "❓Questions/Help/Support"
+ about: If you have a question about the paper, code or algorithm, please ask here!
+ labels: question
+
+ ---
+
+ ## ❓ Questions
+
+ (Please ask your question here.)
.github/workflows/linter.yml ADDED
@@ -0,0 +1,36 @@
+ name: linter
+ on:
+   push:
+     branches: [ main ]
+   pull_request:
+     branches: [ main ]
+   workflow_dispatch:
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     if: ${{ github.repository == 'facebookresearch/demucs' || github.event_name == 'workflow_dispatch' }}
+     steps:
+     - uses: actions/checkout@v2
+     - uses: actions/setup-python@v2
+       with:
+         python-version: 3.8
+
+     - uses: actions/cache@v2
+       with:
+         path: env
+         key: env-${{ hashFiles('**/requirements.txt', '.github/workflows/*') }}
+
+     - name: Install dependencies
+       run: |
+         python3 -m venv env
+         . env/bin/activate
+         python -m pip install --upgrade pip
+         pip install -r requirements.txt
+         pip install '.[dev]'
+
+
+     - name: Run linter
+       run: |
+         . env/bin/activate
+         make linter
.github/workflows/tests.yml ADDED
@@ -0,0 +1,36 @@
+ name: tests
+ on:
+   push:
+     branches: [ main ]
+   pull_request:
+     branches: [ main ]
+   workflow_dispatch:
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     if: ${{ github.repository == 'facebookresearch/demucs' || github.event_name == 'workflow_dispatch' }}
+     steps:
+     - uses: actions/checkout@v2
+     - uses: actions/setup-python@v2
+       with:
+         python-version: 3.8
+
+     - uses: actions/cache@v2
+       with:
+         path: env
+         key: env-${{ hashFiles('**/requirements.txt', '.github/workflows/*') }}
+
+     - name: Install dependencies
+       run: |
+         sudo apt-get update
+         sudo apt-get install -y ffmpeg
+         python3 -m venv env
+         . env/bin/activate
+         python -m pip install --upgrade pip
+         pip install -r requirements.txt
+
+     - name: Run separation test
+       run: |
+         . env/bin/activate
+         make test_eval
.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.egg-info
+ __pycache__
+ Session.vim
+ /build
+ /dist
+ /lab
+ /metadata
+ /notebooks
+ /outputs
+ /release
+ /release_models
+ /separated
+ /tests
+ /trash
+ /misc
+ /mdx
+ .mypy_cache
.vscode/launch.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "configurations": [
+     {
+       "name": "Containers: Python - Fastapi",
+       "type": "docker",
+       "request": "launch",
+       "preLaunchTask": "docker-run: debug",
+       "python": {
+         "pathMappings": [
+           {
+             "localRoot": "${workspaceFolder}",
+             "remoteRoot": "/app"
+           }
+         ],
+         "projectType": "fastapi"
+       }
+     }
+   ]
+ }
.vscode/tasks.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "version": "2.0.0",
+   "tasks": [
+     {
+       "type": "docker-build",
+       "label": "docker-build",
+       "platform": "python",
+       "dockerBuild": {
+         "tag": "demucs:latest",
+         "dockerfile": "${workspaceFolder}/Dockerfile",
+         "context": "${workspaceFolder}",
+         "pull": true
+       }
+     },
+     {
+       "type": "docker-run",
+       "label": "docker-run: debug",
+       "dependsOn": [
+         "docker-build"
+       ],
+       "python": {
+         "args": [
+           "predict:app",
+           "--host",
+           "0.0.0.0",
+           "--port",
+           "8000"
+         ],
+         "module": "uvicorn"
+       }
+     }
+   ]
+ }
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,76 @@
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at <[email protected]>. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72
+
73
+ [homepage]: https://www.contributor-covenant.org
74
+
75
+ For answers to common questions about this code of conduct, see
76
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,23 @@
+ # Contributing to Demucs
+
+ ## Pull Requests
+
+ In order to accept your pull request, we need you to submit a CLA. You only need
+ to do this once to work on any of Facebook's open source projects.
+
+ Complete your CLA here: <https://code.facebook.com/cla>
+
+ Demucs is the implementation of a research paper.
+ Therefore, we do not plan on accepting many pull requests for new features.
+ We certainly welcome them for bug fixes.
+
+
+ ## Issues
+
+ We use GitHub issues to track public bugs. Please ensure your description is
+ clear and has sufficient instructions to be able to reproduce the issue.
+
+
+ ## License
+ By contributing to this repository, you agree that your contributions will be licensed
+ under the LICENSE file in the root directory of this source tree.
Demucs.ipynb ADDED
@@ -0,0 +1,153 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "colab_type": "text",
7
+ "id": "Be9yoh-ILfRr"
8
+ },
9
+ "source": [
10
+ "# Hybrid Demucs\n",
11
+ "\n",
12
+ "Feel free to use the Colab version:\n",
13
+ "https://colab.research.google.com/drive/1dC9nVxk3V_VPjUADsnFu8EiT-xnU1tGH?usp=sharing"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {
20
+ "colab": {
21
+ "base_uri": "https://localhost:8080/",
22
+ "height": 139
23
+ },
24
+ "colab_type": "code",
25
+ "executionInfo": {
26
+ "elapsed": 12277,
27
+ "status": "ok",
28
+ "timestamp": 1583778134659,
29
+ "user": {
30
+ "displayName": "Marllus Lustosa",
31
+ "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GgLl2RbW64ZyWz3Y8IBku0zhHCMnt7fz7fEl0LTdA=s64",
32
+ "userId": "14811735256675200480"
33
+ },
34
+ "user_tz": 180
35
+ },
36
+ "id": "kOjIPLlzhPfn",
37
+ "outputId": "c75f17ec-b576-4105-bc5b-c2ac9c1018a3"
38
+ },
39
+ "outputs": [],
40
+ "source": [
41
+ "!pip install -U demucs\n",
42
+ "# or for local development, if you have a clone of Demucs\n",
43
+ "# pip install -e ."
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "metadata": {
50
+ "colab": {},
51
+ "colab_type": "code",
52
+ "id": "5lYOzKKCKAbJ"
53
+ },
54
+ "outputs": [],
55
+ "source": [
56
+ "# You can use the `demucs` command line to separate tracks\n",
57
+ "!demucs test.mp3"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "# You can also load directly the pretrained models,\n",
67
+ "# for instance for the MDX 2021 winning model of Track A:\n",
68
+ "from demucs import pretrained\n",
69
+ "model = pretrained.get_model('mdx')"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "# Because `model` is a bag of 4 models, you cannot directly call it on your data,\n",
79
+ "# but the `apply_model` will know what to do of it.\n",
80
+ "import torch\n",
81
+ "from demucs.apply import apply_model\n",
82
+ "x = torch.randn(1, 2, 44100 * 10) # ten seconds of white noise for the demo\n",
83
+ "out = apply_model(model, x)[0] # shape is [S, C, T] with S the number of sources\n",
84
+ "\n",
85
+ "# So let see, where is all the white noise content is going ?\n",
86
+ "for name, source in zip(model.sources, out):\n",
87
+ " print(name, source.std() / x.std())\n",
88
+ "# The outputs are quite weird to be fair, not what I would have expected."
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "# now let's take a single model from the bag, and let's test it on a pure cosine\n",
98
+ "freq = 440 # in Hz\n",
99
+ "sr = model.samplerate\n",
100
+ "t = torch.arange(10 * sr).float() / sr\n",
101
+ "x = torch.cos(2 * 3.1416 * freq * t).expand(1, 2, -1)\n",
102
+ "sub_model = model.models[3]\n",
103
+ "out = sub_model(x)[0]\n",
104
+ "\n",
105
+ "# Same question where does it go?\n",
106
+ "for name, source in zip(model.sources, out):\n",
107
+ " print(name, source.std() / x.std())\n",
108
+ " \n",
109
+ "# Well now it makes much more sense, all the energy is going\n",
110
+ "# in the `other` source.\n",
111
+ "# Feel free to try lower pitch (try 80 Hz) to see what happens !"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "# For training or more fun, refer to the Demucs README on our repo\n",
121
+ "# https://github.com/facebookresearch/demucs/tree/main/demucs"
122
+ ]
123
+ }
124
+ ],
125
+ "metadata": {
126
+ "accelerator": "GPU",
127
+ "colab": {
128
+ "authorship_tag": "ABX9TyM9xpVr1M86NRcjtQ7g9tCx",
129
+ "collapsed_sections": [],
130
+ "name": "Demucs.ipynb",
131
+ "provenance": []
132
+ },
133
+ "kernelspec": {
134
+ "display_name": "Python 3",
135
+ "language": "python",
136
+ "name": "python3"
137
+ },
138
+ "language_info": {
139
+ "codemirror_mode": {
140
+ "name": "ipython",
141
+ "version": 3
142
+ },
143
+ "file_extension": ".py",
144
+ "mimetype": "text/x-python",
145
+ "name": "python",
146
+ "nbconvert_exporter": "python",
147
+ "pygments_lexer": "ipython3",
148
+ "version": "3.8.8"
149
+ }
150
+ },
151
+ "nbformat": 4,
152
+ "nbformat_minor": 1
153
+ }
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ # Use Python 3.9 slim base
+ FROM python:3.9-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y ffmpeg git && apt-get clean
+
+ # Set work directory
+ WORKDIR /app
+
+ # Install Python packages
+ RUN pip install --upgrade pip
+ RUN pip install torch torchaudio
+ RUN pip install fastapi uvicorn
+ RUN pip install git+https://github.com/facebookresearch/demucs
+
+ # Copy your inference script into the container
+ COPY predict.py .
+
+ # Expose port for FastAPI
+ EXPOSE 8000
+
+ # Run the FastAPI app
+ CMD ["uvicorn", "predict:app", "--host", "0.0.0.0", "--port", "8000"]
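The `predict.py` copied by this Dockerfile is part of the same 107-file upload but falls outside this truncated 50-file view. Purely as a hedged sketch, a minimal FastAPI wrapper consistent with the `uvicorn predict:app` command above could look like the following; the `/separate` route name, the temporary paths, and the choice to call `demucs.separate.main` are illustrative assumptions, not the committed implementation.

```python
# Hypothetical sketch of predict.py; endpoint name, paths and model choice are assumptions.
import shutil
import tempfile
from pathlib import Path

from fastapi import FastAPI, UploadFile
from fastapi.responses import FileResponse

import demucs.separate

app = FastAPI()


@app.post("/separate")
async def separate(file: UploadFile) -> FileResponse:
    # Write the upload to disk so the Demucs CLI entry point can read it.
    workdir = Path(tempfile.mkdtemp())
    input_path = workdir / (file.filename or "input.mp3")
    with input_path.open("wb") as f:
        shutil.copyfileobj(file.file, f)

    # Same call style as in the README: pass parsed command-line arguments to main().
    demucs.separate.main(["-n", "htdemucs", "-o", str(workdir / "out"), str(input_path)])

    # Demucs writes stems under OUTPUT/MODEL_NAME/TRACK_NAME; return one of them.
    stem = workdir / "out" / "htdemucs" / input_path.stem / "vocals.wav"
    return FileResponse(stem, media_type="audio/wav")
```

Locally, such a script would be served with the same command as the image's `CMD`: `uvicorn predict:app --host 0.0.0.0 --port 8000`.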
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,13 @@
+ recursive-exclude env *
+ recursive-include conf *.yaml
+ include Makefile
+ include LICENSE
+ include demucs.png
+ include outputs.tar.gz
+ include test.mp3
+ include requirements.txt
+ include requirements_minimal.txt
+ include mypy.ini
+ include demucs/py.typed
+ include demucs/remote/*.txt
+ include demucs/remote/*.yaml
Makefile ADDED
@@ -0,0 +1,36 @@
+ all: linter tests
+
+ linter:
+ 	flake8 demucs
+ 	mypy demucs
+
+ tests: test_train test_eval
+
+ test_train: tests/musdb
+ 	_DORA_TEST_PATH=/tmp/demucs python3 -m dora run --clear \
+ 	  dset.musdb=./tests/musdb dset.segment=4 dset.shift=2 epochs=2 model=demucs \
+ 	  demucs.depth=2 demucs.channels=4 test.sdr=false misc.num_workers=0 test.workers=0 \
+ 	  test.shifts=0
+
+ test_eval:
+ 	python3 -m demucs -n demucs_unittest test.mp3
+ 	python3 -m demucs -n demucs_unittest --two-stems=vocals test.mp3
+ 	python3 -m demucs -n demucs_unittest --mp3 test.mp3
+ 	python3 -m demucs -n demucs_unittest --flac --int24 test.mp3
+ 	python3 -m demucs -n demucs_unittest --int24 --clip-mode clamp test.mp3
+ 	python3 -m demucs -n demucs_unittest --segment 8 test.mp3
+ 	python3 -m demucs.api -n demucs_unittest --segment 8 test.mp3
+ 	python3 -m demucs --list-models
+
+ tests/musdb:
+ 	test -e tests || mkdir tests
+ 	python3 -c 'import musdb; musdb.DB("tests/tmp", download=True)'
+ 	musdbconvert tests/tmp tests/musdb
+
+ dist:
+ 	python3 setup.py sdist
+
+ clean:
+ 	rm -r dist build *.egg-info
+
+ .PHONY: linter dist test_train test_eval
README.md CHANGED
@@ -1,14 +1,319 @@
- ---
- title: Audio
- emoji: 📈
- colorFrom: pink
- colorTo: blue
- sdk: gradio
- sdk_version: 5.35.0
- app_file: app.py
- pinned: false
- license: unknown
- short_description: audio processor
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Demucs Music Source Separation
+
+ [![Support Ukraine](https://img.shields.io/badge/Support-Ukraine-FFD500?style=flat&labelColor=005BBB)](https://opensource.fb.com/support-ukraine)
+ ![tests badge](https://github.com/facebookresearch/demucs/workflows/tests/badge.svg)
+ ![linter badge](https://github.com/facebookresearch/demucs/workflows/linter/badge.svg)
+
+
+ **Important:** As I am no longer working at Meta, **this repository is not maintained anymore**.
+ I've created a fork at [github.com/adefossez/demucs](https://github.com/adefossez/demucs); only important bug fixes will be processed on the new repo.
+ Please do not open issues for feature requests or if Demucs doesn't work perfectly for your use case :)
+
+ This is the 4th release of Demucs (v4), featuring Hybrid Transformer-based source separation.
+ **For the classic Hybrid Demucs (v3):** [Go to this commit][demucs_v3].
+ If you are experiencing issues and want the old Demucs back, please file an issue, and then you can get back to Demucs v3 with
+ `git checkout v3`. You can also go back to [Demucs v2][demucs_v2].
+
+
+ Demucs is a state-of-the-art music source separation model, currently capable of separating
+ drums, bass, and vocals from the rest of the accompaniment.
+ Demucs is based on a U-Net convolutional architecture inspired by [Wave-U-Net][waveunet].
+ The v4 version features [Hybrid Transformer Demucs][htdemucs], a hybrid spectrogram/waveform separation model using Transformers.
+ It is based on [Hybrid Demucs][hybrid_paper] (also provided in this repo), with the innermost layers
+ replaced by a cross-domain Transformer Encoder. This Transformer uses self-attention within each domain,
+ and cross-attention across domains.
+ The model achieves an SDR of 9.00 dB on the MUSDB HQ test set. Moreover, when using sparse attention
+ kernels to extend its receptive field and per-source fine-tuning, it achieves a state-of-the-art 9.20 dB of SDR.
+
+ Samples are available [on our sample page](https://ai.honu.io/papers/htdemucs/index.html).
+ Check out [our paper][htdemucs] for more information.
+ It has been trained on the [MUSDB HQ][musdb] dataset + an extra training dataset of 800 songs.
+ This model separates drums, bass, vocals, and other stems for any song.
+
+
+ As Hybrid Transformer Demucs is brand new, it is not activated by default; you can activate it in the usual
+ commands described hereafter with `-n htdemucs_ft`.
+ The single, non fine-tuned model is provided as `-n htdemucs`, and the retrained baseline
+ as `-n hdemucs_mmi`. The Sparse Hybrid Transformer model described in our paper is not provided, as it
+ requires custom CUDA code that is not ready for release yet.
+ We are also releasing an experimental 6-source model that adds `guitar` and `piano` sources.
+ Quick testing seems to show okay quality for `guitar`, but a lot of bleeding and artifacts for the `piano` source.
+
+
+ <p align="center">
+ <img src="./demucs.png" alt="Schema representing the structure of Hybrid Transformer Demucs,
+ with a dual U-Net structure, one branch for the temporal domain,
+ and one branch for the spectral domain. There is a cross-domain Transformer between the Encoders and Decoders."
+ width="800px"></p>
+
+
+
+ ## Important news if you are already using Demucs
+
+ See the [release notes](./docs/release.md) for more details.
+
+ - 22/02/2023: added support for the [SDX 2023 Challenge](https://www.aicrowd.com/challenges/sound-demixing-challenge-2023),
+   see the dedicated [doc page](./docs/sdx23.md)
+ - 07/12/2022: Demucs v4 is now on PyPI. The **htdemucs** model is now used by default. Also releasing
+   a 6-source model (adding `guitar` and `piano`, although the latter doesn't work so well at the moment).
+ - 16/11/2022: Added the new **Hybrid Transformer Demucs v4** models.
+   Adding support for the [torchaudio implementation of HDemucs](https://pytorch.org/audio/stable/tutorials/hybrid_demucs_tutorial.html).
+ - 30/08/2022: added reproducibility and ablation grids, along with an updated version of the paper.
+ - 17/08/2022: Releasing v3.0.5: Set split segment length to reduce memory. Compatible with PyTorch 1.12.
+ - 24/02/2022: Releasing v3.0.4: split into two stems (i.e. karaoke mode).
+   Export as float32 or int24.
+ - 17/12/2021: Releasing v3.0.3: bug fixes (thanks @keunwoochoi), memory drastically
+   reduced on GPU (thanks @famzah) and new multi-core evaluation on CPU (`-j` flag).
+ - 12/11/2021: Releasing **Demucs v3** with hybrid domain separation. Strong improvements
+   on all sources. This is the model that won the Sony MDX challenge.
+ - 11/05/2021: Adding support for MusDB-HQ and arbitrary wav sets, for the MDX challenge. For more information
+   on joining the challenge with Demucs see [the Demucs MDX instructions](docs/mdx.md)
+
+
+ ## Comparison with other models
+
+ We provide hereafter a summary of the different metrics presented in the paper.
+ You can also compare Hybrid Demucs (v3), [KUIELAB-MDX-Net][kuielab], [Spleeter][spleeter], Open-Unmix, Demucs (v1), and Conv-Tasnet on one of my favorite
+ songs on my [soundcloud playlist][soundcloud].
+
+ ### Comparison of accuracy
+
+ `Overall SDR` is the mean of the SDR for each of the 4 sources, `MOS Quality` is a rating from 1 to 5
+ of the naturalness and absence of artifacts given by human listeners (5 = no artifacts), `MOS Contamination`
+ is a rating from 1 to 5 with 5 being zero contamination by other sources. We refer the reader to our [paper][hybrid_paper]
+ for more details.
+
+ | Model                        | Domain      | Extra data?       | Overall SDR | MOS Quality | MOS Contamination |
+ |------------------------------|-------------|-------------------|-------------|-------------|-------------------|
+ | [Wave-U-Net][waveunet]       | waveform    | no                | 3.2         | -           | -                 |
+ | [Open-Unmix][openunmix]      | spectrogram | no                | 5.3         | -           | -                 |
+ | [D3Net][d3net]               | spectrogram | no                | 6.0         | -           | -                 |
+ | [Conv-Tasnet][demucs_v2]     | waveform    | no                | 5.7         | -           | -                 |
+ | [Demucs (v2)][demucs_v2]     | waveform    | no                | 6.3         | 2.37        | 2.36              |
+ | [ResUNetDecouple+][decouple] | spectrogram | no                | 6.7         | -           | -                 |
+ | [KUIELAB-MDX-Net][kuielab]   | hybrid      | no                | 7.5         | **2.86**    | 2.55              |
+ | [Band-Split RNN][bandsplit]  | spectrogram | no                | **8.2**     | -           | -                 |
+ | **Hybrid Demucs (v3)**       | hybrid      | no                | 7.7         | **2.83**    | **3.04**          |
+ | [MMDenseLSTM][mmdenselstm]   | spectrogram | 804 songs         | 6.0         | -           | -                 |
+ | [D3Net][d3net]               | spectrogram | 1.5k songs        | 6.7         | -           | -                 |
+ | [Spleeter][spleeter]         | spectrogram | 25k songs         | 5.9         | -           | -                 |
+ | [Band-Split RNN][bandsplit]  | spectrogram | 1.7k (mixes only) | **9.0**     | -           | -                 |
+ | **HT Demucs f.t. (v4)**      | hybrid      | 800 songs         | **9.0**     | -           | -                 |
+
+
+
+ ## Requirements
+
+ You will need at least Python 3.8. See `requirements_minimal.txt` for requirements for separation only,
+ and `environment-[cpu|cuda].yml` (or `requirements.txt`) if you want to train a new model.
+
+ ### For Windows users
+
+ Every time you see `python3`, replace it with `python.exe`. You should always run commands from the
+ Anaconda console.
+
+ ### For musicians
+
+ If you just want to use Demucs to separate tracks, you can install it with
+
+ ```bash
+ python3 -m pip install -U demucs
+ ```
+
+ For bleeding edge versions, you can install directly from this repo using
+ ```bash
+ python3 -m pip install -U git+https://github.com/facebookresearch/demucs#egg=demucs
+ ```
+
+ Advanced OS support is provided on the following pages; **you must read the page for your OS before posting an issue**:
+ - **If you are using Windows:** [Windows support](docs/windows.md).
+ - **If you are using macOS:** [macOS support](docs/mac.md).
+ - **If you are using Linux:** [Linux support](docs/linux.md).
+
+ ### For machine learning scientists
+
+ If you have anaconda installed, you can run from the root of this repository:
+
+ ```bash
+ conda env update -f environment-cpu.yml  # if you don't have GPUs
+ conda env update -f environment-cuda.yml # if you have GPUs
+ conda activate demucs
+ pip install -e .
+ ```
+
+ This will create a `demucs` environment with all the dependencies installed.
+
+ You will also need to install [soundstretch/soundtouch](https://www.surina.net/soundtouch/soundstretch.html): on macOS you can do `brew install sound-touch`,
+ and on Ubuntu `sudo apt-get install soundstretch`. This is used for the
+ pitch/tempo augmentation.
+
+
+ ### Running in Docker
+
+ Thanks to @xserrat, there is now a Docker image definition ready for using Demucs. This can ensure all libraries are correctly installed without interfering with the host OS. See his repo [Docker Facebook Demucs](https://github.com/xserrat/docker-facebook-demucs) for more information.
+
+
+ ### Running from Colab
+
+ I made a Colab to easily separate tracks with Demucs. Note that
+ transfer speeds with Colab are a bit slow for large media files,
+ but it will allow you to use Demucs without installing anything.
+
+ [Demucs on Google Colab](https://colab.research.google.com/drive/1dC9nVxk3V_VPjUADsnFu8EiT-xnU1tGH?usp=sharing)
+
+ ### Web Demo
+
+ Integrated into [Hugging Face Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/akhaliq/demucs)
+
+ ### Graphical Interface
+
+ @CarlGao4 has released a GUI for Demucs: [CarlGao4/Demucs-Gui](https://github.com/CarlGao4/Demucs-Gui). Downloads for Windows and macOS are available [here](https://github.com/CarlGao4/Demucs-Gui/releases). Use the [FossHub mirror](https://fosshub.com/Demucs-GUI.html) to speed up your download.
+
+ @Anjok07 is providing a self-contained GUI in [UVR (Ultimate Vocal Remover)](https://github.com/facebookresearch/demucs/issues/334) that supports Demucs.
+
+ ### Other providers
+
+ Audiostrip is providing free online separation with Demucs on their website [https://audiostrip.co.uk/](https://audiostrip.co.uk/).
+
+ [MVSep](https://mvsep.com/) also provides free online separation; select `Demucs3 model B` for the best quality.
+
+ [Neutone](https://neutone.space/) provides a realtime Demucs model in their free VST/AU plugin that can be used in your favorite DAW.
+
+
+ ## Separating tracks
+
+ In order to try Demucs, you can just run from any folder (as long as you properly installed it)
+
+ ```bash
+ demucs PATH_TO_AUDIO_FILE_1 [PATH_TO_AUDIO_FILE_2 ...]   # for Demucs
+ # If you used `pip install --user` you might need to replace demucs with python3 -m demucs
+ python3 -m demucs --mp3 --mp3-bitrate BITRATE PATH_TO_AUDIO_FILE_1  # output files saved as MP3
+ #        use --mp3-preset to change encoder preset, 2 for best quality, 7 for fastest
+ # If your filename contains spaces, don't forget to quote it!
+ demucs "my music/my favorite track.mp3"
+ # You can select different models with `-n`; mdx_q is the quantized model, smaller but maybe a bit less accurate.
+ demucs -n mdx_q myfile.mp3
+ # If you only want to separate vocals out of an audio file, use `--two-stems=vocals` (you can also set it to drums or bass)
+ demucs --two-stems=vocals myfile.mp3
+ ```
+
+
+ If you have a GPU but you run out of memory, please use `--segment SEGMENT` to reduce the length of each split. `SEGMENT` should be changed to an integer describing the length of each segment in seconds.
+ A segment length of at least 10 is recommended (the bigger the number, the more memory is required, but quality may increase). Note that the Hybrid Transformer models only support a maximum segment length of 7.8 seconds.
+ Setting the environment variable `PYTORCH_NO_CUDA_MEMORY_CACHING=1` is also helpful. If this still does not help, please add `-d cpu` to the command line. See the section hereafter for more details on the memory requirements for GPU acceleration.
+
+ Separated tracks are stored in the `separated/MODEL_NAME/TRACK_NAME` folder. There you will find four stereo wav files sampled at 44.1 kHz: `drums.wav`, `bass.wav`,
+ `other.wav`, `vocals.wav` (or `.mp3` if you used the `--mp3` option).
+
+ All audio formats supported by `torchaudio` can be processed (i.e. wav, mp3, flac, ogg/vorbis on Linux/macOS, etc.). On Windows, `torchaudio` has limited support, so we rely on `ffmpeg`, which should support pretty much anything.
+ Audio is resampled on the fly if necessary.
+ The output will be a wav file encoded as int16.
+ You can save as float32 wav files with `--float32`, or 24-bit integer wav with `--int24`.
+ You can pass `--mp3` to save as mp3 instead, and set the bitrate (in kbps) with `--mp3-bitrate` (default is 320).
+
+ It can happen that the output would need clipping, in particular due to some separation artifacts.
+ Demucs will automatically rescale each output stem so as to avoid clipping. This can however break
+ the relative volume between stems. If instead you prefer hard clipping, pass `--clip-mode clamp`.
+ You can also try to reduce the volume of the input mixture before feeding it to Demucs.
+
+
+ Other pre-trained models can be selected with the `-n` flag.
+ The list of pre-trained models is:
+ - `htdemucs`: first version of Hybrid Transformer Demucs. Trained on MusDB + 800 songs. Default model.
+ - `htdemucs_ft`: fine-tuned version of `htdemucs`; separation will take 4 times longer
+   but might be a bit better. Same training set as `htdemucs`.
+ - `htdemucs_6s`: 6-source version of `htdemucs`, with `piano` and `guitar` added as sources.
+   Note that the `piano` source is not working great at the moment.
+ - `hdemucs_mmi`: Hybrid Demucs v3, retrained on MusDB + 800 songs.
+ - `mdx`: trained only on MusDB HQ, winning model on track A at the [MDX][mdx] challenge.
+ - `mdx_extra`: trained with extra training data (**including the MusDB test set**), ranked 2nd on track B
+   of the [MDX][mdx] challenge.
+ - `mdx_q`, `mdx_extra_q`: quantized versions of the previous models. Smaller download and storage
+   but quality can be slightly worse.
+ - `SIG`: where `SIG` is a single model from the [model zoo](docs/training.md#model-zoo).
+
+ The `--two-stems=vocals` option allows separating vocals from the rest of the accompaniment (i.e., karaoke mode).
+ `vocals` can be changed to any source in the selected model.
+ This will mix the files after separating the mix fully, so this won't be faster or use less memory.
+
+ The `--shifts=SHIFTS` option performs multiple predictions with random shifts (a.k.a. the *shift trick*) of the input and averages them. This makes prediction `SHIFTS` times
+ slower. Don't use it unless you have a GPU.
+
+ The `--overlap` option controls the amount of overlap between prediction windows. The default is 0.25 (i.e. 25%), which is probably fine.
+ It can probably be reduced to 0.1 to improve speed a bit.
+
+
+ The `-j` flag allows specifying a number of parallel jobs (e.g. `demucs -j 2 myfile.mp3`).
+ This will multiply the RAM used by the same amount, so be careful!
+
+ ### Memory requirements for GPU acceleration
+
+ If you want to use GPU acceleration, you will need at least 3GB of RAM on your GPU for `demucs`. However, about 7GB of RAM will be required if you use the default arguments. Add `--segment SEGMENT` to change the size of each split. If you only have 3GB of memory, set SEGMENT to 8 (though quality may be worse if this argument is too small). Setting the environment variable `PYTORCH_NO_CUDA_MEMORY_CACHING=1` can help users with even less RAM, such as 2GB (I separated a 4-minute track using only 1.5GB), but this will make the separation slower.
+
+ If you do not have enough memory on your GPU, simply add `-d cpu` to the command line to use the CPU. With Demucs, processing time should be roughly equal to 1.5 times the duration of the track.
+
+ ## Calling from another Python program
+
+ The main function provides an `opt` parameter as a simple API. You can just pass the parsed command line as this parameter:
+ ```python
+ # Assume that your command is `demucs --mp3 --two-stems vocals -n mdx_extra "track with space.mp3"`
+ # The following code is the same as the command above:
+ import demucs.separate
+ demucs.separate.main(["--mp3", "--two-stems", "vocals", "-n", "mdx_extra", "track with space.mp3"])
+
+ # Or like this
+ import demucs.separate
+ import shlex
+ demucs.separate.main(shlex.split('--mp3 --two-stems vocals -n mdx_extra "track with space.mp3"'))
+ ```
+
+ To use more complicated APIs, see the [API docs](docs/api.md).
+
+ ## Training Demucs
+
+ If you want to train (Hybrid) Demucs, please follow the [training doc](docs/training.md).
+
+ ## MDX Challenge reproduction
+
+ In order to reproduce the results from the Track A and Track B submissions, check out the [MDX Hybrid Demucs submission repo][mdx_submission].
+
+
+
+ ## How to cite
+
+ ```
+ @inproceedings{rouard2022hybrid,
+   title={Hybrid Transformers for Music Source Separation},
+   author={Rouard, Simon and Massa, Francisco and D{\'e}fossez, Alexandre},
+   booktitle={ICASSP 23},
+   year={2023}
+ }
+
+ @inproceedings{defossez2021hybrid,
+   title={Hybrid Spectrogram and Waveform Source Separation},
+   author={D{\'e}fossez, Alexandre},
+   booktitle={Proceedings of the ISMIR 2021 Workshop on Music Source Separation},
+   year={2021}
+ }
+ ```
+
+ ## License
+
+ Demucs is released under the MIT license as found in the [LICENSE](LICENSE) file.
+
+ [hybrid_paper]: https://arxiv.org/abs/2111.03600
+ [waveunet]: https://github.com/f90/Wave-U-Net
+ [musdb]: https://sigsep.github.io/datasets/musdb.html
+ [openunmix]: https://github.com/sigsep/open-unmix-pytorch
+ [mmdenselstm]: https://arxiv.org/abs/1805.02410
+ [demucs_v2]: https://github.com/facebookresearch/demucs/tree/v2
+ [demucs_v3]: https://github.com/facebookresearch/demucs/tree/v3
+ [spleeter]: https://github.com/deezer/spleeter
+ [soundcloud]: https://soundcloud.com/honualx/sets/source-separation-in-the-waveform-domain
+ [d3net]: https://arxiv.org/abs/2010.01733
+ [mdx]: https://www.aicrowd.com/challenges/music-demixing-challenge-ismir-2021
+ [kuielab]: https://github.com/kuielab/mdx-net-submission
+ [decouple]: https://arxiv.org/abs/2109.05418
+ [mdx_submission]: https://github.com/adefossez/mdx21_demucs
+ [bandsplit]: https://arxiv.org/abs/2209.15174
+ [htdemucs]: https://arxiv.org/abs/2211.08553
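As a hedged illustration of the higher-level interface mentioned in the "Calling from another Python program" section of the README above: the `demucs.api.Separator` class, whose constructor is documented in `demucs/api.py` later in this upload, can be driven roughly as follows. The `separate_audio_file` method name, its `(original, stems)` return shape, and the `samplerate` attribute are assumptions to check against `docs/api.md`.

```python
# Hedged sketch of demucs.api usage; the method names below are assumptions, see docs/api.md.
from demucs.api import Separator, save_audio

# Constructor arguments follow the Separator docstring shipped in demucs/api.py.
separator = Separator(model="htdemucs", shifts=1, overlap=0.25, split=True)

# Assumed call: returns the original waveform and a dict mapping source names to tensors.
origin, stems = separator.separate_audio_file("track with space.mp3")

for name, source in stems.items():
    # save_audio is listed among the module's public functions in the api.py docstring.
    save_audio(source, f"{name}.wav", samplerate=separator.samplerate)
```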
app.py CHANGED
@@ -1,37 +1,37 @@
- import os
- import shutil
- import gradio as gr
- from demucs.separate import main
-
- def separate_stems(audio_file):
-     input_path = "input.mp3"
-     shutil.copy(audio_file, input_path)
-
-     output_dir = "output"
-     if os.path.exists(output_dir):
-         shutil.rmtree(output_dir)
-     os.makedirs(output_dir, exist_ok=True)
-
-     # Run Demucs
-     main(["-n", "htdemucs", "-o", output_dir, input_path])
-
-     # Build list of stems to return
-     base = os.path.splitext(os.path.basename(input_path))[0]
-     stem_path = os.path.join(output_dir, "htdemucs", base)
-     stems = [os.path.join(stem_path, f"{stem}.mp3") for stem in ["vocals", "drums", "bass", "other"]]
-     return stems
-
- demo = gr.Interface(
-     fn=separate_stems,
-     inputs=gr.Audio(type="filepath", label="Upload Song"),
-     outputs=[
-         gr.Audio(label="Vocals"),
-         gr.Audio(label="Drums"),
-         gr.Audio(label="Bass"),
-         gr.Audio(label="Other"),
-     ],
-     title="Demucs v4 Stem Separator",
-     description="Upload a song to separate vocals, drums, bass, and other using Facebook's Demucs model.",
- )
-
- demo.launch()

+ import os
+ import shutil
+ import gradio as gr
+ from demucs.separate import main
+
+ def separate_stems(audio_file):
+     input_path = "input.mp3"
+     shutil.copy(audio_file, input_path)
+
+     output_dir = "output"
+     if os.path.exists(output_dir):
+         shutil.rmtree(output_dir)
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Run Demucs
+     main(["-n", "htdemucs", "-o", output_dir, input_path])
+
+     # Build list of stems to return
+     base = os.path.splitext(os.path.basename(input_path))[0]
+     stem_path = os.path.join(output_dir, "htdemucs", base)
+     stems = [os.path.join(stem_path, f"{stem}.mp3") for stem in ["vocals", "drums", "bass", "other"]]
+     return stems
+
+ demo = gr.Interface(
+     fn=separate_stems,
+     inputs=gr.Audio(type="filepath", label="Upload Song"),
+     outputs=[
+         gr.Audio(label="Vocals"),
+         gr.Audio(label="Drums"),
+         gr.Audio(label="Bass"),
+         gr.Audio(label="Other"),
+     ],
+     title="Demucs v4 Stem Separator",
+     description="Upload a song to separate vocals, drums, bass, and other using Facebook's Demucs model.",
+ )
+
+ demo.launch()
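One detail worth flagging in `separate_stems` above: `demucs.separate.main` is invoked without `--mp3`, and per the README in this same commit the default output is int16 wav, so the generated stems will be `vocals.wav`, `drums.wav`, etc., while the returned paths end in `.mp3`. A hedged one-line adjustment, assuming MP3 output is the intent, would be:

```python
# Request MP3 encoding so the output files match the .mp3 paths built below
# (alternatively, keep the call as-is and build the stem paths with a .wav extension).
main(["--mp3", "-n", "htdemucs", "-o", output_dir, input_path])
```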
conf/config.yaml ADDED
@@ -0,0 +1,304 @@
1
+ defaults:
2
+ - _self_
3
+ - dset: musdb44
4
+ - svd: default
5
+ - variant: default
6
+ - override hydra/hydra_logging: colorlog
7
+ - override hydra/job_logging: colorlog
8
+
9
+ dummy:
10
+ dset:
11
+ musdb: /checkpoint/defossez/datasets/musdbhq
12
+ musdb_samplerate: 44100
13
+ use_musdb: true # set to false to not use musdb as training data.
14
+ wav: # path to custom wav dataset
15
+ wav2: # second custom wav dataset
16
+ segment: 11
17
+ shift: 1
18
+ train_valid: false
19
+ full_cv: true
20
+ samplerate: 44100
21
+ channels: 2
22
+ normalize: true
23
+ metadata: ./metadata
24
+ sources: ['drums', 'bass', 'other', 'vocals']
25
+ valid_samples: # valid dataset size
26
+ backend: null # if provided select torchaudio backend.
27
+
28
+ test:
29
+ save: False
30
+ best: True
31
+ workers: 2
32
+ every: 20
33
+ split: true
34
+ shifts: 1
35
+ overlap: 0.25
36
+ sdr: true
37
+ metric: 'loss' # metric used for best model selection on the valid set, can also be nsdr
38
+ nonhq: # path to non hq MusDB for evaluation
39
+
40
+ epochs: 360
41
+ batch_size: 64
42
+ max_batches: # limit the number of batches per epoch, useful for debugging
43
+ # or if your dataset is gigantic.
44
+ optim:
45
+ lr: 3e-4
46
+ momentum: 0.9
47
+ beta2: 0.999
48
+ loss: l1 # l1 or mse
49
+ optim: adam
50
+ weight_decay: 0
51
+ clip_grad: 0
52
+
53
+ seed: 42
54
+ debug: false
55
+ valid_apply: true
56
+ flag:
57
+ save_every:
58
+ weights: [1., 1., 1., 1.] # weights over each source for the training/valid loss.
59
+
60
+ augment:
61
+ shift_same: false
62
+ repitch:
63
+ proba: 0.2
64
+ max_tempo: 12
65
+ remix:
66
+ proba: 1
67
+ group_size: 4
68
+ scale:
69
+ proba: 1
70
+ min: 0.25
71
+ max: 1.25
72
+ flip: true
73
+
74
+ continue_from: # continue from other XP, give the XP Dora signature.
75
+ continue_pretrained: # signature of a pretrained XP, this cannot be a bag of models.
76
+ pretrained_repo: # repo for pretrained model (default is official AWS)
77
+ continue_best: true
78
+ continue_opt: false
79
+
80
+ misc:
81
+ num_workers: 10
82
+ num_prints: 4
83
+ show: false
84
+ verbose: false
85
+
86
+ # List of decay for EMA at batch or epoch level, e.g. 0.999.
87
+ # Batch level EMA are kept on GPU for speed.
88
+ ema:
89
+ epoch: []
90
+ batch: []
91
+
92
+ use_train_segment: true # to remove
93
+ model_segment: # override the segment parameter for the model, usually 4 times the training segment.
94
+ model: demucs # see demucs/train.py for the possibilities, and config for each model hereafter.
95
+ demucs: # see demucs/demucs.py for a detailed description
96
+ # Channels
97
+ channels: 64
98
+ growth: 2
99
+ # Main structure
100
+ depth: 6
101
+ rewrite: true
102
+ lstm_layers: 0
103
+ # Convolutions
104
+ kernel_size: 8
105
+ stride: 4
106
+ context: 1
107
+ # Activations
108
+ gelu: true
109
+ glu: true
110
+ # Normalization
111
+ norm_groups: 4
112
+ norm_starts: 4
113
+ # DConv residual branch
114
+ dconv_depth: 2
115
+ dconv_mode: 1 # 1 = branch in encoder, 2 = in decoder, 3 = in both.
116
+ dconv_comp: 4
117
+ dconv_attn: 4
118
+ dconv_lstm: 4
119
+ dconv_init: 1e-4
120
+ # Pre/post treatment
121
+ resample: true
122
+ normalize: false
123
+ # Weight init
124
+ rescale: 0.1
125
+
126
+ hdemucs: # see demucs/hdemucs.py for a detailed description
127
+ # Channels
128
+ channels: 48
129
+ channels_time:
130
+ growth: 2
131
+ # STFT
132
+ nfft: 4096
133
+ wiener_iters: 0
134
+ end_iters: 0
135
+ wiener_residual: false
136
+ cac: true
137
+ # Main structure
138
+ depth: 6
139
+ rewrite: true
140
+ hybrid: true
141
+ hybrid_old: false
142
+ # Frequency Branch
143
+ multi_freqs: []
144
+ multi_freqs_depth: 3
145
+ freq_emb: 0.2
146
+ emb_scale: 10
147
+ emb_smooth: true
148
+ # Convolutions
149
+ kernel_size: 8
150
+ stride: 4
151
+ time_stride: 2
152
+ context: 1
153
+ context_enc: 0
154
+ # normalization
155
+ norm_starts: 4
156
+ norm_groups: 4
157
+ # DConv residual branch
158
+ dconv_mode: 1
159
+ dconv_depth: 2
160
+ dconv_comp: 4
161
+ dconv_attn: 4
162
+ dconv_lstm: 4
163
+ dconv_init: 1e-3
164
+ # Weight init
165
+ rescale: 0.1
166
+
167
+ # Torchaudio implementation of HDemucs
168
+ torch_hdemucs:
169
+ # Channels
170
+ channels: 48
171
+ growth: 2
172
+ # STFT
173
+ nfft: 4096
174
+ # Main structure
175
+ depth: 6
176
+ freq_emb: 0.2
177
+ emb_scale: 10
178
+ emb_smooth: true
179
+ # Convolutions
180
+ kernel_size: 8
181
+ stride: 4
182
+ time_stride: 2
183
+ context: 1
184
+ context_enc: 0
185
+ # normalization
186
+ norm_starts: 4
187
+ norm_groups: 4
188
+ # DConv residual branch
189
+ dconv_depth: 2
190
+ dconv_comp: 4
191
+ dconv_attn: 4
192
+ dconv_lstm: 4
193
+ dconv_init: 1e-3
194
+
195
+ htdemucs: # see demucs/htdemucs.py for a detailed description
196
+ # Channels
197
+ channels: 48
198
+ channels_time:
199
+ growth: 2
200
+ # STFT
201
+ nfft: 4096
202
+ wiener_iters: 0
203
+ end_iters: 0
204
+ wiener_residual: false
205
+ cac: true
206
+ # Main structure
207
+ depth: 4
208
+ rewrite: true
209
+ # Frequency Branch
210
+ multi_freqs: []
211
+ multi_freqs_depth: 3
212
+ freq_emb: 0.2
213
+ emb_scale: 10
214
+ emb_smooth: true
215
+ # Convolutions
216
+ kernel_size: 8
217
+ stride: 4
218
+ time_stride: 2
219
+ context: 1
220
+ context_enc: 0
221
+ # normalization
222
+ norm_starts: 4
223
+ norm_groups: 4
224
+ # DConv residual branch
225
+ dconv_mode: 1
226
+ dconv_depth: 2
227
+ dconv_comp: 8
228
+ dconv_init: 1e-3
229
+ # Before the Transformer
230
+ bottom_channels: 0
231
+ # CrossTransformer
232
+ # ------ Common to all
233
+ # Regular parameters
234
+ t_layers: 5
235
+ t_hidden_scale: 4.0
236
+ t_heads: 8
237
+ t_dropout: 0.0
238
+ t_layer_scale: True
239
+ t_gelu: True
240
+ # ------------- Positional Embedding
241
+ t_emb: sin
242
+ t_max_positions: 10000 # for the scaled embedding
243
+ t_max_period: 10000.0
244
+ t_weight_pos_embed: 1.0
245
+ t_cape_mean_normalize: True
246
+ t_cape_augment: True
247
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
248
+ t_sin_random_shift: 0
249
+ # ------------- norm before a transformer encoder
250
+ t_norm_in: True
251
+ t_norm_in_group: False
252
+ # ------------- norm inside the encoder
253
+ t_group_norm: False
254
+ t_norm_first: True
255
+ t_norm_out: True
256
+ # ------------- optim
257
+ t_weight_decay: 0.0
258
+ t_lr:
259
+ # ------------- sparsity
260
+ t_sparse_self_attn: False
261
+ t_sparse_cross_attn: False
262
+ t_mask_type: diag
263
+ t_mask_random_seed: 42
264
+ t_sparse_attn_window: 400
265
+ t_global_window: 100
266
+ t_sparsity: 0.95
267
+ t_auto_sparsity: False
268
+ # Cross Encoder First (False)
269
+ t_cross_first: False
270
+ # Weight init
271
+ rescale: 0.1
272
+
273
+ svd: # see svd.py for documentation
274
+ penalty: 0
275
+ min_size: 0.1
276
+ dim: 1
277
+ niters: 2
278
+ powm: false
279
+ proba: 1
280
+ conv_only: false
281
+ convtr: false
282
+ bs: 1
283
+
284
+ quant: # quantization hyper params
285
+ diffq: # diffq penalty, typically 1e-4 or 3e-4
286
+ qat: # use QAT with a fixed number of bits (not as good as diffq)
287
+ min_size: 0.2
288
+ group_size: 8
289
+
290
+ dora:
291
+ dir: outputs
292
+ exclude: ["misc.*", "slurm.*", 'test.reval', 'flag', 'dset.backend']
293
+
294
+ slurm:
295
+ time: 4320
296
+ constraint: volta32gb
297
+ setup: ['module load cudnn/v8.4.1.50-cuda.11.6 NCCL/2.11.4-6-cuda.11.6 cuda/11.6']
298
+
299
+ # Hydra config
300
+ hydra:
301
+ job_logging:
302
+ formatters:
303
+ colorlog:
304
+ datefmt: "%m-%d %H:%M:%S"
conf/dset/aetl.yaml ADDED
@@ -0,0 +1,19 @@
1
+ # @package _global_
2
+
3
+ # automix dataset with Musdb, extra training data and the test set of Musdb.
4
+ # This used even more remixes than auto_extra_test.
5
+ dset:
6
+ wav: /checkpoint/defossez/datasets/aetl
7
+ samplerate: 44100
8
+ channels: 2
9
+ epochs: 320
10
+ max_batches: 500
11
+
12
+ augment:
13
+ shift_same: true
14
+ scale:
15
+ proba: 0.
16
+ remix:
17
+ proba: 0
18
+ repitch:
19
+ proba: 0
conf/dset/auto_extra_test.yaml ADDED
@@ -0,0 +1,18 @@
1
+ # @package _global_
2
+
3
+ # automix dataset with Musdb, extra training data and the test set of Musdb.
4
+ dset:
5
+ wav: /checkpoint/defossez/datasets/automix_extra_test2
6
+ samplerate: 44100
7
+ channels: 2
8
+ epochs: 320
9
+ max_batches: 500
10
+
11
+ augment:
12
+ shift_same: true
13
+ scale:
14
+ proba: 0.
15
+ remix:
16
+ proba: 0
17
+ repitch:
18
+ proba: 0
conf/dset/auto_mus.yaml ADDED
@@ -0,0 +1,20 @@
1
+ # @package _global_
2
+
3
+ # Automix dataset based on musdb train set.
4
+ dset:
5
+ wav: /checkpoint/defossez/datasets/automix_musdb
6
+ samplerate: 44100
7
+ channels: 2
8
+ epochs: 360
9
+ max_batches: 300
10
+ test:
11
+ every: 4
12
+
13
+ augment:
14
+ shift_same: true
15
+ scale:
16
+ proba: 0.5
17
+ remix:
18
+ proba: 0
19
+ repitch:
20
+ proba: 0
conf/dset/extra44.yaml ADDED
@@ -0,0 +1,8 @@
1
+ # @package _global_
2
+
3
+ # Musdb + extra tracks
4
+ dset:
5
+ wav: /checkpoint/defossez/datasets/allstems_44/
6
+ samplerate: 44100
7
+ channels: 2
8
+ epochs: 320
conf/dset/extra_mmi_goodclean.yaml ADDED
@@ -0,0 +1,12 @@
1
+ # @package _global_
2
+
3
+ # Musdb + extra tracks
4
+ dset:
5
+ wav: /checkpoint/defossez/datasets/allstems_44/
6
+ wav2: /checkpoint/defossez/datasets/mmi44_goodclean
7
+ samplerate: 44100
8
+ channels: 2
9
+ wav2_weight: null
10
+ wav2_valid: false
11
+ valid_samples: 100
12
+ epochs: 1200
conf/dset/extra_test.yaml ADDED
@@ -0,0 +1,12 @@
1
+ # @package _global_
2
+
3
+ # Musdb + extra tracks + test set from musdb.
4
+ dset:
5
+ wav: /checkpoint/defossez/datasets/allstems_test_44/
6
+ samplerate: 44100
7
+ channels: 2
8
+ epochs: 320
9
+ max_batches: 700
10
+ test:
11
+ sdr: false
12
+ every: 500
conf/dset/musdb44.yaml ADDED
@@ -0,0 +1,5 @@
1
+ # @package _global_
2
+
3
+ dset:
4
+ samplerate: 44100
5
+ channels: 2
conf/dset/sdx23_bleeding.yaml ADDED
@@ -0,0 +1,10 @@
1
+ # @package _global_
2
+
3
+ # Musdb + extra tracks
4
+ dset:
5
+ wav: /shared/home/defossez/data/datasets/moisesdb23_bleeding_v1.0/
6
+ use_musdb: false
7
+ samplerate: 44100
8
+ channels: 2
9
+ backend: soundfile # must use soundfile as some mixture would clip with sox.
10
+ epochs: 320
conf/dset/sdx23_labelnoise.yaml ADDED
@@ -0,0 +1,10 @@
1
+ # @package _global_
2
+
3
+ # Musdb + extra tracks
4
+ dset:
5
+ wav: /shared/home/defossez/data/datasets/moisesdb23_labelnoise_v1.0
6
+ use_musdb: false
7
+ samplerate: 44100
8
+ channels: 2
9
+ backend: soundfile # must use soundfile as some mixture would clip with sox.
10
+ epochs: 320
conf/svd/base.yaml ADDED
@@ -0,0 +1,14 @@
1
+ # @package _global_
2
+
3
+ svd:
4
+ penalty: 0
5
+ min_size: 1
6
+ dim: 50
7
+ niters: 4
8
+ powm: false
9
+ proba: 1
10
+ conv_only: false
11
+ convtr: false # ideally this should be true, but some models were trained with this to false.
12
+
13
+ optim:
14
+ beta2: 0.9998
conf/svd/base2.yaml ADDED
@@ -0,0 +1,14 @@
1
+ # @package _global_
2
+
3
+ svd:
4
+ penalty: 0
5
+ min_size: 1
6
+ dim: 100
7
+ niters: 4
8
+ powm: false
9
+ proba: 1
10
+ conv_only: false
11
+ convtr: true
12
+
13
+ optim:
14
+ beta2: 0.9998
conf/svd/default.yaml ADDED
@@ -0,0 +1 @@
1
+ # @package _global_
conf/variant/default.yaml ADDED
@@ -0,0 +1 @@
1
+ # @package _global_
conf/variant/example.yaml ADDED
@@ -0,0 +1,5 @@
1
+ # @package _global_
2
+
3
+ model: hdemucs
4
+ hdemucs:
5
+ channels: 32
conf/variant/finetune.yaml ADDED
@@ -0,0 +1,19 @@
1
+ # @package _global_
2
+
3
+ epochs: 4
4
+ batch_size: 16
5
+ optim:
6
+ lr: 0.0006
7
+ test:
8
+ every: 1
9
+ sdr: false
10
+ dset:
11
+ segment: 28
12
+ shift: 2
13
+
14
+ augment:
15
+ scale:
16
+ proba: 0
17
+ shift_same: true
18
+ remix:
19
+ proba: 0
demucs.png ADDED
Git LFS Details

  • SHA256: 7f8a53c1bbaa6c0268d358cd4cb9c2f1128907758aeb10a79789f7bbf61ded95
  • Pointer size: 131 Bytes
  • Size of remote file: 339 kB
demucs/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ __version__ = "4.1.0a2"
demucs/__main__.py ADDED
@@ -0,0 +1,10 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .separate import main
+
+ if __name__ == '__main__':
+     main()
demucs/api.py ADDED
@@ -0,0 +1,392 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """API methods for demucs
8
+
9
+ Classes
10
+ -------
11
+ `demucs.api.Separator`: The base separator class
12
+
13
+ Functions
14
+ ---------
15
+ `demucs.api.save_audio`: Save an audio
16
+ `demucs.api.list_models`: Get models list
17
+
18
+ Examples
19
+ --------
20
+ See the end of this module (if __name__ == "__main__")
21
+ """
22
+
23
+ import subprocess
24
+
25
+ import torch as th
26
+ import torchaudio as ta
27
+
28
+ from dora.log import fatal
29
+ from pathlib import Path
30
+ from typing import Optional, Callable, Dict, Tuple, Union
31
+
32
+ from .apply import apply_model, _replace_dict
33
+ from .audio import AudioFile, convert_audio, save_audio
34
+ from .pretrained import get_model, _parse_remote_files, REMOTE_ROOT
35
+ from .repo import RemoteRepo, LocalRepo, ModelOnlyRepo, BagOnlyRepo
36
+
37
+
38
+ class LoadAudioError(Exception):
39
+ pass
40
+
41
+
42
+ class LoadModelError(Exception):
43
+ pass
44
+
45
+
46
+ class _NotProvided:
47
+ pass
48
+
49
+
50
+ NotProvided = _NotProvided()
51
+
52
+
53
+ class Separator:
54
+ def __init__(
55
+ self,
56
+ model: str = "htdemucs",
57
+ repo: Optional[Path] = None,
58
+ device: str = "cuda" if th.cuda.is_available() else "cpu",
59
+ shifts: int = 1,
60
+ overlap: float = 0.25,
61
+ split: bool = True,
62
+ segment: Optional[int] = None,
63
+ jobs: int = 0,
64
+ progress: bool = False,
65
+ callback: Optional[Callable[[dict], None]] = None,
66
+ callback_arg: Optional[dict] = None,
67
+ ):
68
+ """
69
+ `class Separator`
70
+ =================
71
+
72
+ Parameters
73
+ ----------
74
+ model: Pretrained model name or signature. Default is htdemucs.
75
+ repo: Folder containing all pre-trained models for use.
76
+ segment: Length (in seconds) of each segment (only available if `split` is `True`). If \
77
+ not specified, will use the command line option.
78
+ shifts: If > 0, will shift in time `wav` by a random amount between 0 and 0.5 sec and \
79
+ apply the opposite shift to the output. This is repeated `shifts` times and all \
80
+ predictions are averaged. This effectively makes the model time equivariant and \
81
+ improves SDR by up to 0.2 points. If not specified, will use the command line option.
82
+ split: If True, the input will be broken down into small chunks (length set by `segment`) \
83
+ and predictions will be performed individually on each and concatenated. Useful for \
84
+ model with large memory footprint like Tasnet. If not specified, will use the command \
85
+ line option.
86
+ overlap: The overlap between the splits. If not specified, will use the command line \
87
+ option.
88
+ device (torch.device, str, or None): If provided, device on which to execute the \
89
+ computation, otherwise `wav.device` is assumed. When `device` is different from \
90
+ `wav.device`, only local computations will be on `device`, while the entire tracks \
91
+ will be stored on `wav.device`. If not specified, will use the command line option.
92
+ jobs: Number of jobs. This can increase memory usage but will be much faster when \
93
+ multiple cores are available. If not specified, will use the command line option.
94
+ callback: A function that will be called when the separation of a chunk starts or finishes. \
95
+ The argument passed to the function will be a dict. For more information, please see \
96
+ the Callback section.
97
+ callback_arg: A dict containing private parameters to be passed to callback function. For \
98
+ more information, please see the Callback section.
99
+ progress: If true, show a progress bar.
100
+
101
+ Callback
102
+ --------
103
+ The function will be called with only one positional parameter whose type is `dict`. The
104
+ `callback_arg` will be combined with information of current separation progress. The
105
+ progress information will override the values in `callback_arg` if same key has been used.
106
+ To abort the separation, raise `KeyboardInterrupt`.
107
+
108
+ Progress information contains several keys (These keys will always exist):
109
+ - `model_idx_in_bag`: The index of the submodel in `BagOfModels`. Starts from 0.
110
+ - `shift_idx`: The index of shifts. Starts from 0.
111
+ - `segment_offset`: The offset of current segment. If the number is 441000, it doesn't
112
+ mean that it is at the 441000 second of the audio, but the "frame" of the tensor.
113
+ - `state`: Could be `"start"` or `"end"`.
114
+ - `audio_length`: Length of the audio (in "frame" of the tensor).
115
+ - `models`: Count of submodels in the model.
116
+ """
117
+ self._name = model
118
+ self._repo = repo
119
+ self._load_model()
120
+ self.update_parameter(device=device, shifts=shifts, overlap=overlap, split=split,
121
+ segment=segment, jobs=jobs, progress=progress, callback=callback,
122
+ callback_arg=callback_arg)
123
+
124
+ def update_parameter(
125
+ self,
126
+ device: Union[str, _NotProvided] = NotProvided,
127
+ shifts: Union[int, _NotProvided] = NotProvided,
128
+ overlap: Union[float, _NotProvided] = NotProvided,
129
+ split: Union[bool, _NotProvided] = NotProvided,
130
+ segment: Optional[Union[int, _NotProvided]] = NotProvided,
131
+ jobs: Union[int, _NotProvided] = NotProvided,
132
+ progress: Union[bool, _NotProvided] = NotProvided,
133
+ callback: Optional[
134
+ Union[Callable[[dict], None], _NotProvided]
135
+ ] = NotProvided,
136
+ callback_arg: Optional[Union[dict, _NotProvided]] = NotProvided,
137
+ ):
138
+ """
139
+ Update the parameters of separation.
140
+
141
+ Parameters
142
+ ----------
143
+ segment: Length (in seconds) of each segment (only available if `split` is `True`). If \
144
+ not specified, will use the command line option.
145
+ shifts: If > 0, will shift in time `wav` by a random amount between 0 and 0.5 sec and \
146
+ apply the oppositve shift to the output. This is repeated `shifts` time and all \
147
+ predictions are averaged. This effectively makes the model time equivariant and \
148
+ improves SDR by up to 0.2 points. If not specified, will use the command line option.
149
+ split: If True, the input will be broken down into small chunks (length set by `segment`) \
150
+ and predictions will be performed individually on each and concatenated. Useful for \
151
+ models with a large memory footprint like Tasnet. If not specified, will use the command \
152
+ line option.
153
+ overlap: The overlap between the splits. If not specified, will use the command line \
154
+ option.
155
+ device (torch.device, str, or None): If provided, device on which to execute the \
156
+ computation, otherwise `wav.device` is assumed. When `device` is different from \
157
+ `wav.device`, only local computations will be on `device`, while the entire tracks \
158
+ will be stored on `wav.device`. If not specified, will use the command line option.
159
+ jobs: Number of jobs. This can increase memory usage but will be much faster when \
160
+ multiple cores are available. If not specified, will use the command line option.
161
+ callback: A function that will be called when the separation of a chunk starts or finishes. \
162
+ The argument passed to the function will be a dict. For more information, please see \
163
+ the Callback section.
164
+ callback_arg: A dict containing private parameters to be passed to callback function. For \
165
+ more information, please see the Callback section.
166
+ progress: If true, show a progress bar.
167
+
168
+ Callback
169
+ --------
170
+ The function will be called with only one positional parameter whose type is `dict`. The
171
+ `callback_arg` will be combined with information of current separation progress. The
172
+ progress information will override the values in `callback_arg` if same key has been used.
173
+ To abort the separation, raise `KeyboardInterrupt`.
174
+
175
+ Progress information contains several keys (These keys will always exist):
176
+ - `model_idx_in_bag`: The index of the submodel in `BagOfModels`. Starts from 0.
177
+ - `shift_idx`: The index of shifts. Starts from 0.
178
+ - `segment_offset`: The offset of current segment. If the number is 441000, it doesn't
179
+ mean that it is at the 441000th second of the audio, but at frame 441000 of the tensor.
180
+ - `state`: Could be `"start"` or `"end"`.
181
+ - `audio_length`: Length of the audio (in "frame" of the tensor).
182
+ - `models`: Count of submodels in the model.
183
+ """
184
+ if not isinstance(device, _NotProvided):
185
+ self._device = device
186
+ if not isinstance(shifts, _NotProvided):
187
+ self._shifts = shifts
188
+ if not isinstance(overlap, _NotProvided):
189
+ self._overlap = overlap
190
+ if not isinstance(split, _NotProvided):
191
+ self._split = split
192
+ if not isinstance(segment, _NotProvided):
193
+ self._segment = segment
194
+ if not isinstance(jobs, _NotProvided):
195
+ self._jobs = jobs
196
+ if not isinstance(progress, _NotProvided):
197
+ self._progress = progress
198
+ if not isinstance(callback, _NotProvided):
199
+ self._callback = callback
200
+ if not isinstance(callback_arg, _NotProvided):
201
+ self._callback_arg = callback_arg
202
+
203
+ def _load_model(self):
204
+ self._model = get_model(name=self._name, repo=self._repo)
205
+ if self._model is None:
206
+ raise LoadModelError("Failed to load model")
207
+ self._audio_channels = self._model.audio_channels
208
+ self._samplerate = self._model.samplerate
209
+
210
+ def _load_audio(self, track: Path):
211
+ errors = {}
212
+ wav = None
213
+
214
+ try:
215
+ wav = AudioFile(track).read(streams=0, samplerate=self._samplerate,
216
+ channels=self._audio_channels)
217
+ except FileNotFoundError:
218
+ errors["ffmpeg"] = "FFmpeg is not installed."
219
+ except subprocess.CalledProcessError:
220
+ errors["ffmpeg"] = "FFmpeg could not read the file."
221
+
222
+ if wav is None:
223
+ try:
224
+ wav, sr = ta.load(str(track))
225
+ except RuntimeError as err:
226
+ errors["torchaudio"] = err.args[0]
227
+ else:
228
+ wav = convert_audio(wav, sr, self._samplerate, self._audio_channels)
229
+
230
+ if wav is None:
231
+ raise LoadAudioError(
232
+ "\n".join(
233
+ "When trying to load using {}, got the following error: {}".format(
234
+ backend, error
235
+ )
236
+ for backend, error in errors.items()
237
+ )
238
+ )
239
+ return wav
240
+
241
+ def separate_tensor(
242
+ self, wav: th.Tensor, sr: Optional[int] = None
243
+ ) -> Tuple[th.Tensor, Dict[str, th.Tensor]]:
244
+ """
245
+ Separate a loaded tensor.
246
+
247
+ Parameters
248
+ ----------
249
+ wav: Waveform of the audio. Should have 2 dimensions, the first is each audio channel, \
250
+ while the second is the waveform of each channel. Type should be float32. \
251
+ e.g. `tuple(wav.shape) == (2, 884000)` means the audio has 2 channels.
252
+ sr: Sample rate of the original audio, the wave will be resampled if it doesn't match the \
253
+ model.
254
+
255
+ Returns
256
+ -------
257
+ A tuple, whose first element is the original wave and second element is a dict, whose keys
258
+ are the name of stems and values are separated waves. The original wave will have already
259
+ been resampled.
260
+
261
+ Notes
262
+ -----
263
+ Use this function with caution. This function does not perform any data validation.
264
+ """
265
+ if sr is not None and sr != self.samplerate:
266
+ wav = convert_audio(wav, sr, self._samplerate, self._audio_channels)
267
+ ref = wav.mean(0)
268
+ wav -= ref.mean()
269
+ wav /= ref.std() + 1e-8
270
+ out = apply_model(
271
+ self._model,
272
+ wav[None],
273
+ segment=self._segment,
274
+ shifts=self._shifts,
275
+ split=self._split,
276
+ overlap=self._overlap,
277
+ device=self._device,
278
+ num_workers=self._jobs,
279
+ callback=self._callback,
280
+ callback_arg=_replace_dict(
281
+ self._callback_arg, ("audio_length", wav.shape[1])
282
+ ),
283
+ progress=self._progress,
284
+ )
285
+ if out is None:
286
+ raise KeyboardInterrupt
287
+ out *= ref.std() + 1e-8
288
+ out += ref.mean()
289
+ wav *= ref.std() + 1e-8
290
+ wav += ref.mean()
291
+ return (wav, dict(zip(self._model.sources, out[0])))
292
+
293
+ def separate_audio_file(self, file: Path):
294
+ """
295
+ Separate an audio file. The method will automatically read the file.
296
+
297
+ Parameters
298
+ ----------
299
+ file: Path of the file to be separated.
300
+
301
+ Returns
302
+ -------
303
+ A tuple, whose first element is the original wave and second element is a dict, whose keys
304
+ are the name of stems and values are separated waves. The original wave will have already
305
+ been resampled.
306
+ """
307
+ return self.separate_tensor(self._load_audio(file), self.samplerate)
308
+
309
+ @property
310
+ def samplerate(self):
311
+ return self._samplerate
312
+
313
+ @property
314
+ def audio_channels(self):
315
+ return self._audio_channels
316
+
317
+ @property
318
+ def model(self):
319
+ return self._model
320
+
321
+
322
+ def list_models(repo: Optional[Path] = None) -> Dict[str, Dict[str, Union[str, Path]]]:
323
+ """
324
+ List the available models. Please remember that not all the returned models can be
325
+ successfully loaded.
326
+
327
+ Parameters
328
+ ----------
329
+ repo: The repo whose models are to be listed.
330
+
331
+ Returns
332
+ -------
333
+ A dict with two keys ("single" for single models and "bag" for bags of models). The values
334
+ are dicts mapping model names to their files (local paths or remote URLs).
335
+ """
336
+ model_repo: ModelOnlyRepo
337
+ if repo is None:
338
+ models = _parse_remote_files(REMOTE_ROOT / 'files.txt')
339
+ model_repo = RemoteRepo(models)
340
+ bag_repo = BagOnlyRepo(REMOTE_ROOT, model_repo)
341
+ else:
342
+ if not repo.is_dir():
343
+ fatal(f"{repo} must exist and be a directory.")
344
+ model_repo = LocalRepo(repo)
345
+ bag_repo = BagOnlyRepo(repo, model_repo)
346
+ return {"single": model_repo.list_model(), "bag": bag_repo.list_model()}
347
+
348
+
349
+ if __name__ == "__main__":
350
+ # Test API functions
351
+ # two-stem not supported
352
+
353
+ from .separate import get_parser
354
+
355
+ args = get_parser().parse_args()
356
+ separator = Separator(
357
+ model=args.name,
358
+ repo=args.repo,
359
+ device=args.device,
360
+ shifts=args.shifts,
361
+ overlap=args.overlap,
362
+ split=args.split,
363
+ segment=args.segment,
364
+ jobs=args.jobs,
365
+ callback=print
366
+ )
367
+ out = args.out / args.name
368
+ out.mkdir(parents=True, exist_ok=True)
369
+ for file in args.tracks:
370
+ separated = separator.separate_audio_file(file)[1]
371
+ if args.mp3:
372
+ ext = "mp3"
373
+ elif args.flac:
374
+ ext = "flac"
375
+ else:
376
+ ext = "wav"
377
+ kwargs = {
378
+ "samplerate": separator.samplerate,
379
+ "bitrate": args.mp3_bitrate,
380
+ "clip": args.clip_mode,
381
+ "as_float": args.float32,
382
+ "bits_per_sample": 24 if args.int24 else 16,
383
+ }
384
+ for stem, source in separated.items():
385
+ stem = out / args.filename.format(
386
+ track=Path(file).name.rsplit(".", 1)[0],
387
+ trackext=Path(file).name.rsplit(".", 1)[-1],
388
+ stem=stem,
389
+ ext=ext,
390
+ )
391
+ stem.parent.mkdir(parents=True, exist_ok=True)
392
+ save_audio(source, str(stem), **kwargs)
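
A short usage sketch of the API defined above. It only relies on names exported by demucs/api.py (`Separator`, `save_audio`); the input is the test.mp3 shipped with this upload and the model name matches the class default:

import torch as th
from demucs.api import Separator, save_audio

separator = Separator(model="htdemucs",
                      device="cuda" if th.cuda.is_available() else "cpu",
                      progress=True)
origin, separated = separator.separate_audio_file("test.mp3")
for stem, source in separated.items():
    # each `source` is a (channels, samples) float32 tensor at separator.samplerate
    save_audio(source, f"{stem}.wav", samplerate=separator.samplerate)
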
demucs/apply.py ADDED
@@ -0,0 +1,322 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ Code to apply a model to a mix. It will handle chunking with overlaps and
8
+ interpolation between chunks, as well as the "shift trick".
9
+ """
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ import copy
12
+ import random
13
+ from threading import Lock
14
+ import typing as tp
15
+
16
+ import torch as th
17
+ from torch import nn
18
+ from torch.nn import functional as F
19
+ import tqdm
20
+
21
+ from .demucs import Demucs
22
+ from .hdemucs import HDemucs
23
+ from .htdemucs import HTDemucs
24
+ from .utils import center_trim, DummyPoolExecutor
25
+
26
+ Model = tp.Union[Demucs, HDemucs, HTDemucs]
27
+
28
+
29
+ class BagOfModels(nn.Module):
30
+ def __init__(self, models: tp.List[Model],
31
+ weights: tp.Optional[tp.List[tp.List[float]]] = None,
32
+ segment: tp.Optional[float] = None):
33
+ """
34
+ Represents a bag of models with specific weights.
35
+ You should call `apply_model` rather than calling directly the forward here for
36
+ optimal performance.
37
+
38
+ Args:
39
+ models (list[nn.Module]): list of Demucs/HDemucs models.
40
+ weights (list[list[float]]): list of weights. If None, assumed to
41
+ be all ones, otherwise it should be a list of N list (N number of models),
42
+ each containing S floats (S number of sources).
43
+ segment (None or float): overrides the `segment` attribute of each model
44
+ (this is performed in place, be careful if you reuse the models passed).
45
+ """
46
+ super().__init__()
47
+ assert len(models) > 0
48
+ first = models[0]
49
+ for other in models:
50
+ assert other.sources == first.sources
51
+ assert other.samplerate == first.samplerate
52
+ assert other.audio_channels == first.audio_channels
53
+ if segment is not None:
54
+ if not isinstance(other, HTDemucs) and segment > other.segment:
55
+ other.segment = segment
56
+
57
+ self.audio_channels = first.audio_channels
58
+ self.samplerate = first.samplerate
59
+ self.sources = first.sources
60
+ self.models = nn.ModuleList(models)
61
+
62
+ if weights is None:
63
+ weights = [[1. for _ in first.sources] for _ in models]
64
+ else:
65
+ assert len(weights) == len(models)
66
+ for weight in weights:
67
+ assert len(weight) == len(first.sources)
68
+ self.weights = weights
69
+
70
+ @property
71
+ def max_allowed_segment(self) -> float:
72
+ max_allowed_segment = float('inf')
73
+ for model in self.models:
74
+ if isinstance(model, HTDemucs):
75
+ max_allowed_segment = min(max_allowed_segment, float(model.segment))
76
+ return max_allowed_segment
77
+
78
+ def forward(self, x):
79
+ raise NotImplementedError("Call `apply_model` on this.")
80
+
81
+
82
+ class TensorChunk:
83
+ def __init__(self, tensor, offset=0, length=None):
84
+ total_length = tensor.shape[-1]
85
+ assert offset >= 0
86
+ assert offset < total_length
87
+
88
+ if length is None:
89
+ length = total_length - offset
90
+ else:
91
+ length = min(total_length - offset, length)
92
+
93
+ if isinstance(tensor, TensorChunk):
94
+ self.tensor = tensor.tensor
95
+ self.offset = offset + tensor.offset
96
+ else:
97
+ self.tensor = tensor
98
+ self.offset = offset
99
+ self.length = length
100
+ self.device = tensor.device
101
+
102
+ @property
103
+ def shape(self):
104
+ shape = list(self.tensor.shape)
105
+ shape[-1] = self.length
106
+ return shape
107
+
108
+ def padded(self, target_length):
109
+ delta = target_length - self.length
110
+ total_length = self.tensor.shape[-1]
111
+ assert delta >= 0
112
+
113
+ start = self.offset - delta // 2
114
+ end = start + target_length
115
+
116
+ correct_start = max(0, start)
117
+ correct_end = min(total_length, end)
118
+
119
+ pad_left = correct_start - start
120
+ pad_right = end - correct_end
121
+
122
+ out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right))
123
+ assert out.shape[-1] == target_length
124
+ return out
125
+
126
+
127
+ def tensor_chunk(tensor_or_chunk):
128
+ if isinstance(tensor_or_chunk, TensorChunk):
129
+ return tensor_or_chunk
130
+ else:
131
+ assert isinstance(tensor_or_chunk, th.Tensor)
132
+ return TensorChunk(tensor_or_chunk)
133
+
134
+
135
+ def _replace_dict(_dict: tp.Optional[dict], *subs: tp.Tuple[tp.Hashable, tp.Any]) -> dict:
136
+ if _dict is None:
137
+ _dict = {}
138
+ else:
139
+ _dict = copy.copy(_dict)
140
+ for key, value in subs:
141
+ _dict[key] = value
142
+ return _dict
143
+
144
+
145
+ def apply_model(model: tp.Union[BagOfModels, Model],
146
+ mix: tp.Union[th.Tensor, TensorChunk],
147
+ shifts: int = 1, split: bool = True,
148
+ overlap: float = 0.25, transition_power: float = 1.,
149
+ progress: bool = False, device=None,
150
+ num_workers: int = 0, segment: tp.Optional[float] = None,
151
+ pool=None, lock=None,
152
+ callback: tp.Optional[tp.Callable[[dict], None]] = None,
153
+ callback_arg: tp.Optional[dict] = None) -> th.Tensor:
154
+ """
155
+ Apply model to a given mixture.
156
+
157
+ Args:
158
+ shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec
159
+ and apply the oppositve shift to the output. This is repeated `shifts` time and
160
+ all predictions are averaged. This effectively makes the model time equivariant
161
+ and improves SDR by up to 0.2 points.
162
+ split (bool): if True, the input will be broken down in 8 seconds extracts
163
+ and predictions will be performed individually on each and concatenated.
164
+ Useful for model with large memory footprint like Tasnet.
165
+ progress (bool): if True, show a progress bar (requires split=True)
166
+ device (torch.device, str, or None): if provided, device on which to
167
+ execute the computation, otherwise `mix.device` is assumed.
168
+ When `device` is different from `mix.device`, only local computations will
169
+ be on `device`, while the entire tracks will be stored on `mix.device`.
170
+ num_workers (int): if non zero, device is 'cpu', how many threads to
171
+ use in parallel.
172
+ segment (float or None): override the model segment parameter.
173
+ """
174
+ if device is None:
175
+ device = mix.device
176
+ else:
177
+ device = th.device(device)
178
+ if pool is None:
179
+ if num_workers > 0 and device.type == 'cpu':
180
+ pool = ThreadPoolExecutor(num_workers)
181
+ else:
182
+ pool = DummyPoolExecutor()
183
+ if lock is None:
184
+ lock = Lock()
185
+ callback_arg = _replace_dict(
186
+ callback_arg, *{"model_idx_in_bag": 0, "shift_idx": 0, "segment_offset": 0}.items()
187
+ )
188
+ kwargs: tp.Dict[str, tp.Any] = {
189
+ 'shifts': shifts,
190
+ 'split': split,
191
+ 'overlap': overlap,
192
+ 'transition_power': transition_power,
193
+ 'progress': progress,
194
+ 'device': device,
195
+ 'pool': pool,
196
+ 'segment': segment,
197
+ 'lock': lock,
198
+ }
199
+ out: tp.Union[float, th.Tensor]
200
+ res: tp.Union[float, th.Tensor]
201
+ if isinstance(model, BagOfModels):
202
+ # Special treatment for bag of model.
203
+ # We explicitely apply multiple times `apply_model` so that the random shifts
204
+ # are different for each model.
205
+ estimates: tp.Union[float, th.Tensor] = 0.
206
+ totals = [0.] * len(model.sources)
207
+ callback_arg["models"] = len(model.models)
208
+ for sub_model, model_weights in zip(model.models, model.weights):
209
+ kwargs["callback"] = ((
210
+ lambda d, i=callback_arg["model_idx_in_bag"]: callback(
211
+ _replace_dict(d, ("model_idx_in_bag", i))) if callback else None)
212
+ )
213
+ original_model_device = next(iter(sub_model.parameters())).device
214
+ sub_model.to(device)
215
+
216
+ res = apply_model(sub_model, mix, **kwargs, callback_arg=callback_arg)
217
+ out = res
218
+ sub_model.to(original_model_device)
219
+ for k, inst_weight in enumerate(model_weights):
220
+ out[:, k, :, :] *= inst_weight
221
+ totals[k] += inst_weight
222
+ estimates += out
223
+ del out
224
+ callback_arg["model_idx_in_bag"] += 1
225
+
226
+ assert isinstance(estimates, th.Tensor)
227
+ for k in range(estimates.shape[1]):
228
+ estimates[:, k, :, :] /= totals[k]
229
+ return estimates
230
+
231
+ if "models" not in callback_arg:
232
+ callback_arg["models"] = 1
233
+ model.to(device)
234
+ model.eval()
235
+ assert transition_power >= 1, "transition_power < 1 leads to weird behavior."
236
+ batch, channels, length = mix.shape
237
+ if shifts:
238
+ kwargs['shifts'] = 0
239
+ max_shift = int(0.5 * model.samplerate)
240
+ mix = tensor_chunk(mix)
241
+ assert isinstance(mix, TensorChunk)
242
+ padded_mix = mix.padded(length + 2 * max_shift)
243
+ out = 0.
244
+ for shift_idx in range(shifts):
245
+ offset = random.randint(0, max_shift)
246
+ shifted = TensorChunk(padded_mix, offset, length + max_shift - offset)
247
+ kwargs["callback"] = (
248
+ (lambda d, i=shift_idx: callback(_replace_dict(d, ("shift_idx", i)))
249
+ if callback else None)
250
+ )
251
+ res = apply_model(model, shifted, **kwargs, callback_arg=callback_arg)
252
+ shifted_out = res
253
+ out += shifted_out[..., max_shift - offset:]
254
+ out /= shifts
255
+ assert isinstance(out, th.Tensor)
256
+ return out
257
+ elif split:
258
+ kwargs['split'] = False
259
+ out = th.zeros(batch, len(model.sources), channels, length, device=mix.device)
260
+ sum_weight = th.zeros(length, device=mix.device)
261
+ if segment is None:
262
+ segment = model.segment
263
+ assert segment is not None and segment > 0.
264
+ segment_length: int = int(model.samplerate * segment)
265
+ stride = int((1 - overlap) * segment_length)
266
+ offsets = range(0, length, stride)
267
+ scale = float(format(stride / model.samplerate, ".2f"))
268
+ # We start from a triangle shaped weight, with maximal weight in the middle
269
+ # of the segment. Then we normalize and take to the power `transition_power`.
270
+ # Large values of transition power will lead to sharper transitions.
271
+ weight = th.cat([th.arange(1, segment_length // 2 + 1, device=device),
272
+ th.arange(segment_length - segment_length // 2, 0, -1, device=device)])
273
+ assert len(weight) == segment_length
274
+ # If the overlap < 50%, this will translate to linear transition when
275
+ # transition_power is 1.
276
+ weight = (weight / weight.max())**transition_power
277
+ futures = []
278
+ for offset in offsets:
279
+ chunk = TensorChunk(mix, offset, segment_length)
280
+ future = pool.submit(apply_model, model, chunk, **kwargs, callback_arg=callback_arg,
281
+ callback=(lambda d, i=offset:
282
+ callback(_replace_dict(d, ("segment_offset", i)))
283
+ if callback else None))
284
+ futures.append((future, offset))
285
+ offset += segment_length
286
+ if progress:
287
+ futures = tqdm.tqdm(futures, unit_scale=scale, ncols=120, unit='seconds')
288
+ for future, offset in futures:
289
+ try:
290
+ chunk_out = future.result() # type: th.Tensor
291
+ except Exception:
292
+ pool.shutdown(wait=True, cancel_futures=True)
293
+ raise
294
+ chunk_length = chunk_out.shape[-1]
295
+ out[..., offset:offset + segment_length] += (
296
+ weight[:chunk_length] * chunk_out).to(mix.device)
297
+ sum_weight[offset:offset + segment_length] += weight[:chunk_length].to(mix.device)
298
+ assert sum_weight.min() > 0
299
+ out /= sum_weight
300
+ assert isinstance(out, th.Tensor)
301
+ return out
302
+ else:
303
+ valid_length: int
304
+ if isinstance(model, HTDemucs) and segment is not None:
305
+ valid_length = int(segment * model.samplerate)
306
+ elif hasattr(model, 'valid_length'):
307
+ valid_length = model.valid_length(length) # type: ignore
308
+ else:
309
+ valid_length = length
310
+ mix = tensor_chunk(mix)
311
+ assert isinstance(mix, TensorChunk)
312
+ padded_mix = mix.padded(valid_length).to(device)
313
+ with lock:
314
+ if callback is not None:
315
+ callback(_replace_dict(callback_arg, ("state", "start"))) # type: ignore
316
+ with th.no_grad():
317
+ out = model(padded_mix)
318
+ with lock:
319
+ if callback is not None:
320
+ callback(_replace_dict(callback_arg, ("state", "end"))) # type: ignore
321
+ assert isinstance(out, th.Tensor)
322
+ return center_trim(out, length)
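
`apply_model` can also be used directly, without the `Separator` wrapper from the API module. A sketch assuming a pretrained model fetched through `demucs.pretrained.get_model` (part of this upload, not shown in this view) and a random stereo mix; the callback keys are the ones documented in the docstring above:

import torch as th
from demucs.apply import apply_model
from demucs.pretrained import get_model

model = get_model("htdemucs")
mix = th.randn(1, model.audio_channels, 10 * model.samplerate)  # (batch, channels, time)

def on_progress(info):
    # `state` is "start" or "end"; offsets and lengths are in frames of the tensor.
    print(info["state"], info["segment_offset"], "/", info["audio_length"])

out = apply_model(model, mix, split=True, overlap=0.25, progress=False,
                  callback=on_progress,
                  callback_arg={"audio_length": mix.shape[-1]})
print(out.shape)  # (batch, len(model.sources), channels, time)
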
demucs/audio.py ADDED
@@ -0,0 +1,265 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ import json
7
+ import subprocess as sp
8
+ from pathlib import Path
9
+
10
+ import lameenc
11
+ import julius
12
+ import numpy as np
13
+ import torch
14
+ import torchaudio as ta
15
+ import typing as tp
16
+
17
+ from .utils import temp_filenames
18
+
19
+
20
+ def _read_info(path):
21
+ stdout_data = sp.check_output([
22
+ 'ffprobe', "-loglevel", "panic",
23
+ str(path), '-print_format', 'json', '-show_format', '-show_streams'
24
+ ])
25
+ return json.loads(stdout_data.decode('utf-8'))
26
+
27
+
28
+ class AudioFile:
29
+ """
30
+ Allows reading audio from any format supported by ffmpeg, as well as resampling or
31
+ converting to mono on the fly. See :method:`read` for more details.
32
+ """
33
+ def __init__(self, path: Path):
34
+ self.path = Path(path)
35
+ self._info = None
36
+
37
+ def __repr__(self):
38
+ features = [("path", self.path)]
39
+ features.append(("samplerate", self.samplerate()))
40
+ features.append(("channels", self.channels()))
41
+ features.append(("streams", len(self)))
42
+ features_str = ", ".join(f"{name}={value}" for name, value in features)
43
+ return f"AudioFile({features_str})"
44
+
45
+ @property
46
+ def info(self):
47
+ if self._info is None:
48
+ self._info = _read_info(self.path)
49
+ return self._info
50
+
51
+ @property
52
+ def duration(self):
53
+ return float(self.info['format']['duration'])
54
+
55
+ @property
56
+ def _audio_streams(self):
57
+ return [
58
+ index for index, stream in enumerate(self.info["streams"])
59
+ if stream["codec_type"] == "audio"
60
+ ]
61
+
62
+ def __len__(self):
63
+ return len(self._audio_streams)
64
+
65
+ def channels(self, stream=0):
66
+ return int(self.info['streams'][self._audio_streams[stream]]['channels'])
67
+
68
+ def samplerate(self, stream=0):
69
+ return int(self.info['streams'][self._audio_streams[stream]]['sample_rate'])
70
+
71
+ def read(self,
72
+ seek_time=None,
73
+ duration=None,
74
+ streams=slice(None),
75
+ samplerate=None,
76
+ channels=None):
77
+ """
78
+ Slightly more efficient implementation than stempeg,
79
+ in particular, this will extract all stems at once
80
+ rather than having to loop over one file multiple times
81
+ for each stream.
82
+
83
+ Args:
84
+ seek_time (float): seek time in seconds or None if no seeking is needed.
85
+ duration (float): duration in seconds to extract or None to extract until the end.
86
+ streams (slice, int or list): streams to extract, can be a single int, a list or
87
+ a slice. If it is a slice or list, the output will be of size [S, C, T]
88
+ with S the number of streams, C the number of channels and T the number of samples.
89
+ If it is an int, the output will be [C, T].
90
+ samplerate (int): if provided, will resample on the fly. If None, no resampling will
91
+ be done. Original sampling rate can be obtained with :method:`samplerate`.
92
+ channels (int): if 1, will convert to mono. We do not rely on ffmpeg for that
93
+ as ffmpeg automatically scale by +3dB to conserve volume when playing on speakers.
94
+ See https://sound.stackexchange.com/a/42710.
95
+ Our definition of mono is simply the average of the two channels. Any other
96
+ value will be ignored.
97
+ """
98
+ streams = np.array(range(len(self)))[streams]
99
+ single = not isinstance(streams, np.ndarray)
100
+ if single:
101
+ streams = [streams]
102
+
103
+ if duration is None:
104
+ target_size = None
105
+ query_duration = None
106
+ else:
107
+ target_size = int((samplerate or self.samplerate()) * duration)
108
+ query_duration = float((target_size + 1) / (samplerate or self.samplerate()))
109
+
110
+ with temp_filenames(len(streams)) as filenames:
111
+ command = ['ffmpeg', '-y']
112
+ command += ['-loglevel', 'panic']
113
+ if seek_time:
114
+ command += ['-ss', str(seek_time)]
115
+ command += ['-i', str(self.path)]
116
+ for stream, filename in zip(streams, filenames):
117
+ command += ['-map', f'0:{self._audio_streams[stream]}']
118
+ if query_duration is not None:
119
+ command += ['-t', str(query_duration)]
120
+ command += ['-threads', '1']
121
+ command += ['-f', 'f32le']
122
+ if samplerate is not None:
123
+ command += ['-ar', str(samplerate)]
124
+ command += [filename]
125
+
126
+ sp.run(command, check=True)
127
+ wavs = []
128
+ for filename in filenames:
129
+ wav = np.fromfile(filename, dtype=np.float32)
130
+ wav = torch.from_numpy(wav)
131
+ wav = wav.view(-1, self.channels()).t()
132
+ if channels is not None:
133
+ wav = convert_audio_channels(wav, channels)
134
+ if target_size is not None:
135
+ wav = wav[..., :target_size]
136
+ wavs.append(wav)
137
+ wav = torch.stack(wavs, dim=0)
138
+ if single:
139
+ wav = wav[0]
140
+ return wav
141
+
142
+
143
+ def convert_audio_channels(wav, channels=2):
144
+ """Convert audio to the given number of channels."""
145
+ *shape, src_channels, length = wav.shape
146
+ if src_channels == channels:
147
+ pass
148
+ elif channels == 1:
149
+ # Case 1:
150
+ # The caller asked 1-channel audio, but the stream have multiple
151
+ # channels, downmix all channels.
152
+ wav = wav.mean(dim=-2, keepdim=True)
153
+ elif src_channels == 1:
154
+ # Case 2:
155
+ # The caller asked for multiple channels, but the input file have
156
+ # one single channel, replicate the audio over all channels.
157
+ wav = wav.expand(*shape, channels, length)
158
+ elif src_channels >= channels:
159
+ # Case 3:
160
+ # The caller asked for multiple channels, and the input file have
161
+ # more channels than requested. In that case return the first channels.
162
+ wav = wav[..., :channels, :]
163
+ else:
164
+ # Case 4: What is a reasonable choice here?
165
+ raise ValueError('The audio file has less channels than requested but is not mono.')
166
+ return wav
167
+
168
+
169
+ def convert_audio(wav, from_samplerate, to_samplerate, channels) -> torch.Tensor:
170
+ """Convert audio from a given samplerate to a target one and target number of channels."""
171
+ wav = convert_audio_channels(wav, channels)
172
+ return julius.resample_frac(wav, from_samplerate, to_samplerate)
173
+
174
+
175
+ def i16_pcm(wav):
176
+ """Convert audio to 16 bits integer PCM format."""
177
+ if wav.dtype.is_floating_point:
178
+ return (wav.clamp_(-1, 1) * (2**15 - 1)).short()
179
+ else:
180
+ return wav
181
+
182
+
183
+ def f32_pcm(wav):
184
+ """Convert audio to float 32 bits PCM format."""
185
+ if wav.dtype.is_floating_point:
186
+ return wav
187
+ else:
188
+ return wav.float() / (2**15 - 1)
189
+
190
+
191
+ def as_dtype_pcm(wav, dtype):
192
+ """Convert audio to either f32 pcm or i16 pcm depending on the given dtype."""
193
+ if wav.dtype.is_floating_point:
194
+ return f32_pcm(wav)
195
+ else:
196
+ return i16_pcm(wav)
197
+
198
+
199
+ def encode_mp3(wav, path, samplerate=44100, bitrate=320, quality=2, verbose=False):
200
+ """Save given audio as mp3. This should work on all OSes."""
201
+ C, T = wav.shape
202
+ wav = i16_pcm(wav)
203
+ encoder = lameenc.Encoder()
204
+ encoder.set_bit_rate(bitrate)
205
+ encoder.set_in_sample_rate(samplerate)
206
+ encoder.set_channels(C)
207
+ encoder.set_quality(quality) # 2-highest, 7-fastest
208
+ if not verbose:
209
+ encoder.silence()
210
+ wav = wav.data.cpu()
211
+ wav = wav.transpose(0, 1).numpy()
212
+ mp3_data = encoder.encode(wav.tobytes())
213
+ mp3_data += encoder.flush()
214
+ with open(path, "wb") as f:
215
+ f.write(mp3_data)
216
+
217
+
218
+ def prevent_clip(wav, mode='rescale'):
219
+ """
220
+ different strategies for avoiding raw clipping.
221
+ """
222
+ if mode is None or mode == 'none':
223
+ return wav
224
+ assert wav.dtype.is_floating_point, "too late for clipping"
225
+ if mode == 'rescale':
226
+ wav = wav / max(1.01 * wav.abs().max(), 1)
227
+ elif mode == 'clamp':
228
+ wav = wav.clamp(-0.99, 0.99)
229
+ elif mode == 'tanh':
230
+ wav = torch.tanh(wav)
231
+ else:
232
+ raise ValueError(f"Invalid mode {mode}")
233
+ return wav
234
+
235
+
236
+ def save_audio(wav: torch.Tensor,
237
+ path: tp.Union[str, Path],
238
+ samplerate: int,
239
+ bitrate: int = 320,
240
+ clip: tp.Literal["rescale", "clamp", "tanh", "none"] = 'rescale',
241
+ bits_per_sample: tp.Literal[16, 24, 32] = 16,
242
+ as_float: bool = False,
243
+ preset: tp.Literal[2, 3, 4, 5, 6, 7] = 2):
244
+ """Save audio file, automatically preventing clipping if necessary
245
+ based on the given `clip` strategy. If the path ends in `.mp3`, this
246
+ will save as mp3 with the given `bitrate`. Use `preset` to set mp3 quality:
247
+ 2 for highest quality, 7 for fastest speed
248
+ """
249
+ wav = prevent_clip(wav, mode=clip)
250
+ path = Path(path)
251
+ suffix = path.suffix.lower()
252
+ if suffix == ".mp3":
253
+ encode_mp3(wav, path, samplerate, bitrate, preset, verbose=True)
254
+ elif suffix == ".wav":
255
+ if as_float:
256
+ bits_per_sample = 32
257
+ encoding = 'PCM_F'
258
+ else:
259
+ encoding = 'PCM_S'
260
+ ta.save(str(path), wav, sample_rate=samplerate,
261
+ encoding=encoding, bits_per_sample=bits_per_sample)
262
+ elif suffix == ".flac":
263
+ ta.save(str(path), wav, sample_rate=samplerate, bits_per_sample=bits_per_sample)
264
+ else:
265
+ raise ValueError(f"Invalid suffix for path: {suffix}")
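
A short sketch of the I/O helpers above. `AudioFile` shells out to ffmpeg/ffprobe, so both must be on the PATH; the file below is the test.mp3 from this upload:

from demucs.audio import AudioFile, convert_audio, save_audio

f = AudioFile("test.mp3")
print(f)  # AudioFile(path=test.mp3, samplerate=..., channels=..., streams=...)
wav = f.read(streams=0, samplerate=44100, channels=2)   # (2, T) float32 tensor
mono = convert_audio(wav, 44100, 16000, channels=1)     # downmix + resample on the fly
save_audio(mono, "test_mono.wav", samplerate=16000)
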
demucs/augment.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Data augmentations.
7
+ """
8
+
9
+ import random
10
+ import torch as th
11
+ from torch import nn
12
+
13
+
14
+ class Shift(nn.Module):
15
+ """
16
+ Randomly shift audio in time by up to `shift` samples.
17
+ """
18
+ def __init__(self, shift=8192, same=False):
19
+ super().__init__()
20
+ self.shift = shift
21
+ self.same = same
22
+
23
+ def forward(self, wav):
24
+ batch, sources, channels, time = wav.size()
25
+ length = time - self.shift
26
+ if self.shift > 0:
27
+ if not self.training:
28
+ wav = wav[..., :length]
29
+ else:
30
+ srcs = 1 if self.same else sources
31
+ offsets = th.randint(self.shift, [batch, srcs, 1, 1], device=wav.device)
32
+ offsets = offsets.expand(-1, sources, channels, -1)
33
+ indexes = th.arange(length, device=wav.device)
34
+ wav = wav.gather(3, indexes + offsets)
35
+ return wav
36
+
37
+
38
+ class FlipChannels(nn.Module):
39
+ """
40
+ Flip left-right channels.
41
+ """
42
+ def forward(self, wav):
43
+ batch, sources, channels, time = wav.size()
44
+ if self.training and wav.size(2) == 2:
45
+ left = th.randint(2, (batch, sources, 1, 1), device=wav.device)
46
+ left = left.expand(-1, -1, -1, time)
47
+ right = 1 - left
48
+ wav = th.cat([wav.gather(2, left), wav.gather(2, right)], dim=2)
49
+ return wav
50
+
51
+
52
+ class FlipSign(nn.Module):
53
+ """
54
+ Random sign flip.
55
+ """
56
+ def forward(self, wav):
57
+ batch, sources, channels, time = wav.size()
58
+ if self.training:
59
+ signs = th.randint(2, (batch, sources, 1, 1), device=wav.device, dtype=th.float32)
60
+ wav = wav * (2 * signs - 1)
61
+ return wav
62
+
63
+
64
+ class Remix(nn.Module):
65
+ """
66
+ Shuffle sources to make new mixes.
67
+ """
68
+ def __init__(self, proba=1, group_size=4):
69
+ """
70
+ Shuffle sources within one batch.
71
+ Each batch is divided into groups of size `group_size` and shuffling is done within
72
+ each group separatly. This allow to keep the same probability distribution no matter
73
+ the number of GPUs. Without this grouping, using more GPUs would lead to a higher
74
+ probability of keeping two sources from the same track together which can impact
75
+ performance.
76
+ """
77
+ super().__init__()
78
+ self.proba = proba
79
+ self.group_size = group_size
80
+
81
+ def forward(self, wav):
82
+ batch, streams, channels, time = wav.size()
83
+ device = wav.device
84
+
85
+ if self.training and random.random() < self.proba:
86
+ group_size = self.group_size or batch
87
+ if batch % group_size != 0:
88
+ raise ValueError(f"Batch size {batch} must be divisible by group size {group_size}")
89
+ groups = batch // group_size
90
+ wav = wav.view(groups, group_size, streams, channels, time)
91
+ permutations = th.argsort(th.rand(groups, group_size, streams, 1, 1, device=device),
92
+ dim=1)
93
+ wav = wav.gather(1, permutations.expand(-1, -1, -1, channels, time))
94
+ wav = wav.view(batch, streams, channels, time)
95
+ return wav
96
+
97
+
98
+ class Scale(nn.Module):
99
+ def __init__(self, proba=1., min=0.25, max=1.25):
100
+ super().__init__()
101
+ self.proba = proba
102
+ self.min = min
103
+ self.max = max
104
+
105
+ def forward(self, wav):
106
+ batch, streams, channels, time = wav.size()
107
+ device = wav.device
108
+ if self.training and random.random() < self.proba:
109
+ scales = th.empty(batch, streams, 1, 1, device=device).uniform_(self.min, self.max)
110
+ wav *= scales
111
+ return wav
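
These modules all expect waveforms of shape (batch, sources, channels, time) and are no-ops outside training mode. A minimal sketch of chaining them the way a training loop could (random data, group size matching the batch size):

import torch as th
from torch import nn
from demucs.augment import FlipChannels, FlipSign, Remix, Scale, Shift

augment = nn.Sequential(Shift(shift=8192), FlipSign(), FlipChannels(),
                        Scale(proba=1.0), Remix(proba=1.0, group_size=4))
augment.train()  # the augmentations only trigger in training mode

sources = th.randn(4, 4, 2, 6 * 44100)  # (batch, sources, channels, time)
sources = augment(sources)              # Shift drops `shift` samples from the length
mix = sources.sum(dim=1)                # the mixture actually fed to the model
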
demucs/demucs.py ADDED
@@ -0,0 +1,447 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import typing as tp
9
+
10
+ import julius
11
+ import torch
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+
15
+ from .states import capture_init
16
+ from .utils import center_trim, unfold
17
+ from .transformer import LayerScale
18
+
19
+
20
+ class BLSTM(nn.Module):
21
+ """
22
+ BiLSTM with same hidden units as input dim.
23
+ If `max_steps` is not None, input will be splitting in overlapping
24
+ chunks and the LSTM applied separately on each chunk.
25
+ """
26
+ def __init__(self, dim, layers=1, max_steps=None, skip=False):
27
+ super().__init__()
28
+ assert max_steps is None or max_steps % 4 == 0
29
+ self.max_steps = max_steps
30
+ self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
31
+ self.linear = nn.Linear(2 * dim, dim)
32
+ self.skip = skip
33
+
34
+ def forward(self, x):
35
+ B, C, T = x.shape
36
+ y = x
37
+ framed = False
38
+ if self.max_steps is not None and T > self.max_steps:
39
+ width = self.max_steps
40
+ stride = width // 2
41
+ frames = unfold(x, width, stride)
42
+ nframes = frames.shape[2]
43
+ framed = True
44
+ x = frames.permute(0, 2, 1, 3).reshape(-1, C, width)
45
+
46
+ x = x.permute(2, 0, 1)
47
+
48
+ x = self.lstm(x)[0]
49
+ x = self.linear(x)
50
+ x = x.permute(1, 2, 0)
51
+ if framed:
52
+ out = []
53
+ frames = x.reshape(B, -1, C, width)
54
+ limit = stride // 2
55
+ for k in range(nframes):
56
+ if k == 0:
57
+ out.append(frames[:, k, :, :-limit])
58
+ elif k == nframes - 1:
59
+ out.append(frames[:, k, :, limit:])
60
+ else:
61
+ out.append(frames[:, k, :, limit:-limit])
62
+ out = torch.cat(out, -1)
63
+ out = out[..., :T]
64
+ x = out
65
+ if self.skip:
66
+ x = x + y
67
+ return x
68
+
69
+
70
+ def rescale_conv(conv, reference):
71
+ """Rescale initial weight scale. It is unclear why it helps but it certainly does.
72
+ """
73
+ std = conv.weight.std().detach()
74
+ scale = (std / reference)**0.5
75
+ conv.weight.data /= scale
76
+ if conv.bias is not None:
77
+ conv.bias.data /= scale
78
+
79
+
80
+ def rescale_module(module, reference):
81
+ for sub in module.modules():
82
+ if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)):
83
+ rescale_conv(sub, reference)
84
+
85
+
86
+ class DConv(nn.Module):
87
+ """
88
+ New residual branches in each encoder layer.
89
+ This alternates dilated convolutions, potentially with LSTMs and attention.
90
+ Also before entering each residual branch, dimension is projected on a smaller subspace,
91
+ e.g. of dim `channels // compress`.
92
+ """
93
+ def __init__(self, channels: int, compress: float = 4, depth: int = 2, init: float = 1e-4,
94
+ norm=True, attn=False, heads=4, ndecay=4, lstm=False, gelu=True,
95
+ kernel=3, dilate=True):
96
+ """
97
+ Args:
98
+ channels: input/output channels for residual branch.
99
+ compress: amount of channel compression inside the branch.
100
+ depth: number of layers in the residual branch. Each layer has its own
101
+ projection, and potentially LSTM and attention.
102
+ init: initial scale for LayerNorm.
103
+ norm: use GroupNorm.
104
+ attn: use LocalAttention.
105
+ heads: number of heads for the LocalAttention.
106
+ ndecay: number of decay controls in the LocalAttention.
107
+ lstm: use LSTM.
108
+ gelu: Use GELU activation.
109
+ kernel: kernel size for the (dilated) convolutions.
110
+ dilate: if true, use dilation, increasing with the depth.
111
+ """
112
+
113
+ super().__init__()
114
+ assert kernel % 2 == 1
115
+ self.channels = channels
116
+ self.compress = compress
117
+ self.depth = abs(depth)
118
+ dilate = depth > 0
119
+
120
+ norm_fn: tp.Callable[[int], nn.Module]
121
+ norm_fn = lambda d: nn.Identity() # noqa
122
+ if norm:
123
+ norm_fn = lambda d: nn.GroupNorm(1, d) # noqa
124
+
125
+ hidden = int(channels / compress)
126
+
127
+ act: tp.Type[nn.Module]
128
+ if gelu:
129
+ act = nn.GELU
130
+ else:
131
+ act = nn.ReLU
132
+
133
+ self.layers = nn.ModuleList([])
134
+ for d in range(self.depth):
135
+ dilation = 2 ** d if dilate else 1
136
+ padding = dilation * (kernel // 2)
137
+ mods = [
138
+ nn.Conv1d(channels, hidden, kernel, dilation=dilation, padding=padding),
139
+ norm_fn(hidden), act(),
140
+ nn.Conv1d(hidden, 2 * channels, 1),
141
+ norm_fn(2 * channels), nn.GLU(1),
142
+ LayerScale(channels, init),
143
+ ]
144
+ if attn:
145
+ mods.insert(3, LocalState(hidden, heads=heads, ndecay=ndecay))
146
+ if lstm:
147
+ mods.insert(3, BLSTM(hidden, layers=2, max_steps=200, skip=True))
148
+ layer = nn.Sequential(*mods)
149
+ self.layers.append(layer)
150
+
151
+ def forward(self, x):
152
+ for layer in self.layers:
153
+ x = x + layer(x)
154
+ return x
155
+
156
+
157
+ class LocalState(nn.Module):
158
+ """Local state allows to have attention based only on data (no positional embedding),
159
+ but while setting a constraint on the time window (e.g. decaying penalty term).
160
+
161
+ Also a failed experiments with trying to provide some frequency based attention.
162
+ """
163
+ def __init__(self, channels: int, heads: int = 4, nfreqs: int = 0, ndecay: int = 4):
164
+ super().__init__()
165
+ assert channels % heads == 0, (channels, heads)
166
+ self.heads = heads
167
+ self.nfreqs = nfreqs
168
+ self.ndecay = ndecay
169
+ self.content = nn.Conv1d(channels, channels, 1)
170
+ self.query = nn.Conv1d(channels, channels, 1)
171
+ self.key = nn.Conv1d(channels, channels, 1)
172
+ if nfreqs:
173
+ self.query_freqs = nn.Conv1d(channels, heads * nfreqs, 1)
174
+ if ndecay:
175
+ self.query_decay = nn.Conv1d(channels, heads * ndecay, 1)
176
+ # Initialize decay close to zero (there is a sigmoid), for maximum initial window.
177
+ self.query_decay.weight.data *= 0.01
178
+ assert self.query_decay.bias is not None # stupid type checker
179
+ self.query_decay.bias.data[:] = -2
180
+ self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1)
181
+
182
+ def forward(self, x):
183
+ B, C, T = x.shape
184
+ heads = self.heads
185
+ indexes = torch.arange(T, device=x.device, dtype=x.dtype)
186
+ # left index are keys, right index are queries
187
+ delta = indexes[:, None] - indexes[None, :]
188
+
189
+ queries = self.query(x).view(B, heads, -1, T)
190
+ keys = self.key(x).view(B, heads, -1, T)
191
+ # t are keys, s are queries
192
+ dots = torch.einsum("bhct,bhcs->bhts", keys, queries)
193
+ dots /= keys.shape[2]**0.5
194
+ if self.nfreqs:
195
+ periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype)
196
+ freq_kernel = torch.cos(2 * math.pi * delta / periods.view(-1, 1, 1))
197
+ freq_q = self.query_freqs(x).view(B, heads, -1, T) / self.nfreqs ** 0.5
198
+ dots += torch.einsum("fts,bhfs->bhts", freq_kernel, freq_q)
199
+ if self.ndecay:
200
+ decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype)
201
+ decay_q = self.query_decay(x).view(B, heads, -1, T)
202
+ decay_q = torch.sigmoid(decay_q) / 2
203
+ decay_kernel = - decays.view(-1, 1, 1) * delta.abs() / self.ndecay**0.5
204
+ dots += torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q)
205
+
206
+ # Kill self reference.
207
+ dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100)
208
+ weights = torch.softmax(dots, dim=2)
209
+
210
+ content = self.content(x).view(B, heads, -1, T)
211
+ result = torch.einsum("bhts,bhct->bhcs", weights, content)
212
+ if self.nfreqs:
213
+ time_sig = torch.einsum("bhts,fts->bhfs", weights, freq_kernel)
214
+ result = torch.cat([result, time_sig], 2)
215
+ result = result.reshape(B, -1, T)
216
+ return x + self.proj(result)
217
+
218
+
219
+ class Demucs(nn.Module):
220
+ @capture_init
221
+ def __init__(self,
222
+ sources,
223
+ # Channels
224
+ audio_channels=2,
225
+ channels=64,
226
+ growth=2.,
227
+ # Main structure
228
+ depth=6,
229
+ rewrite=True,
230
+ lstm_layers=0,
231
+ # Convolutions
232
+ kernel_size=8,
233
+ stride=4,
234
+ context=1,
235
+ # Activations
236
+ gelu=True,
237
+ glu=True,
238
+ # Normalization
239
+ norm_starts=4,
240
+ norm_groups=4,
241
+ # DConv residual branch
242
+ dconv_mode=1,
243
+ dconv_depth=2,
244
+ dconv_comp=4,
245
+ dconv_attn=4,
246
+ dconv_lstm=4,
247
+ dconv_init=1e-4,
248
+ # Pre/post processing
249
+ normalize=True,
250
+ resample=True,
251
+ # Weight init
252
+ rescale=0.1,
253
+ # Metadata
254
+ samplerate=44100,
255
+ segment=4 * 10):
256
+ """
257
+ Args:
258
+ sources (list[str]): list of source names
259
+ audio_channels (int): stereo or mono
260
+ channels (int): first convolution channels
261
+ depth (int): number of encoder/decoder layers
262
+ growth (float): multiply (resp divide) number of channels by that
263
+ for each layer of the encoder (resp decoder)
264
+ depth (int): number of layers in the encoder and in the decoder.
265
+ rewrite (bool): add 1x1 convolution to each layer.
266
+ lstm_layers (int): number of lstm layers, 0 = no lstm. Deactivated
267
+ by default, as this is now replaced by the smaller and faster small LSTMs
268
+ in the DConv branches.
269
+ kernel_size (int): kernel size for convolutions
270
+ stride (int): stride for convolutions
271
+ context (int): kernel size of the convolution in the
272
+ decoder before the transposed convolution. If > 1,
273
+ will provide some context from neighboring time steps.
274
+ gelu: use GELU activation function.
275
+ glu (bool): use glu instead of ReLU for the 1x1 rewrite conv.
276
+ norm_starts: layer at which group norm starts being used.
277
+ decoder layers are numbered in reverse order.
278
+ norm_groups: number of groups for group norm.
279
+ dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
280
+ dconv_depth: depth of residual DConv branch.
281
+ dconv_comp: compression of DConv branch.
282
+ dconv_attn: adds attention layers in DConv branch starting at this layer.
283
+ dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
284
+ dconv_init: initial scale for the DConv branch LayerScale.
285
+ normalize (bool): normalizes the input audio on the fly, and scales back
286
+ the output by the same amount.
287
+ resample (bool): upsample x2 the input and downsample /2 the output.
288
+ rescale (float): rescale initial weights of convolutions
289
+ to get their standard deviation closer to `rescale`.
290
+ samplerate (int): stored as meta information for easing
291
+ future evaluations of the model.
292
+ segment (float): duration of the chunks of audio to ideally evaluate the model on.
293
+ This is used by `demucs.apply.apply_model`.
294
+ """
295
+
296
+ super().__init__()
297
+ self.audio_channels = audio_channels
298
+ self.sources = sources
299
+ self.kernel_size = kernel_size
300
+ self.context = context
301
+ self.stride = stride
302
+ self.depth = depth
303
+ self.resample = resample
304
+ self.channels = channels
305
+ self.normalize = normalize
306
+ self.samplerate = samplerate
307
+ self.segment = segment
308
+ self.encoder = nn.ModuleList()
309
+ self.decoder = nn.ModuleList()
310
+ self.skip_scales = nn.ModuleList()
311
+
312
+ if glu:
313
+ activation = nn.GLU(dim=1)
314
+ ch_scale = 2
315
+ else:
316
+ activation = nn.ReLU()
317
+ ch_scale = 1
318
+ if gelu:
319
+ act2 = nn.GELU
320
+ else:
321
+ act2 = nn.ReLU
322
+
323
+ in_channels = audio_channels
324
+ padding = 0
325
+ for index in range(depth):
326
+ norm_fn = lambda d: nn.Identity() # noqa
327
+ if index >= norm_starts:
328
+ norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa
329
+
330
+ encode = []
331
+ encode += [
332
+ nn.Conv1d(in_channels, channels, kernel_size, stride),
333
+ norm_fn(channels),
334
+ act2(),
335
+ ]
336
+ attn = index >= dconv_attn
337
+ lstm = index >= dconv_lstm
338
+ if dconv_mode & 1:
339
+ encode += [DConv(channels, depth=dconv_depth, init=dconv_init,
340
+ compress=dconv_comp, attn=attn, lstm=lstm)]
341
+ if rewrite:
342
+ encode += [
343
+ nn.Conv1d(channels, ch_scale * channels, 1),
344
+ norm_fn(ch_scale * channels), activation]
345
+ self.encoder.append(nn.Sequential(*encode))
346
+
347
+ decode = []
348
+ if index > 0:
349
+ out_channels = in_channels
350
+ else:
351
+ out_channels = len(self.sources) * audio_channels
352
+ if rewrite:
353
+ decode += [
354
+ nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context),
355
+ norm_fn(ch_scale * channels), activation]
356
+ if dconv_mode & 2:
357
+ decode += [DConv(channels, depth=dconv_depth, init=dconv_init,
358
+ compress=dconv_comp, attn=attn, lstm=lstm)]
359
+ decode += [nn.ConvTranspose1d(channels, out_channels,
360
+ kernel_size, stride, padding=padding)]
361
+ if index > 0:
362
+ decode += [norm_fn(out_channels), act2()]
363
+ self.decoder.insert(0, nn.Sequential(*decode))
364
+ in_channels = channels
365
+ channels = int(growth * channels)
366
+
367
+ channels = in_channels
368
+ if lstm_layers:
369
+ self.lstm = BLSTM(channels, lstm_layers)
370
+ else:
371
+ self.lstm = None
372
+
373
+ if rescale:
374
+ rescale_module(self, reference=rescale)
375
+
376
+ def valid_length(self, length):
377
+ """
378
+ Return the nearest valid length to use with the model so that
379
+ there is no time steps left over in a convolution, e.g. for all
380
+ layers, size of the input - kernel_size % stride = 0.
381
+
382
+ Note that input are automatically padded if necessary to ensure that the output
383
+ has the same length as the input.
384
+ """
385
+ if self.resample:
386
+ length *= 2
387
+
388
+ for _ in range(self.depth):
389
+ length = math.ceil((length - self.kernel_size) / self.stride) + 1
390
+ length = max(1, length)
391
+
392
+ for idx in range(self.depth):
393
+ length = (length - 1) * self.stride + self.kernel_size
394
+
395
+ if self.resample:
396
+ length = math.ceil(length / 2)
397
+ return int(length)
398
+
399
+ def forward(self, mix):
400
+ x = mix
401
+ length = x.shape[-1]
402
+
403
+ if self.normalize:
404
+ mono = mix.mean(dim=1, keepdim=True)
405
+ mean = mono.mean(dim=-1, keepdim=True)
406
+ std = mono.std(dim=-1, keepdim=True)
407
+ x = (x - mean) / (1e-5 + std)
408
+ else:
409
+ mean = 0
410
+ std = 1
411
+
412
+ delta = self.valid_length(length) - length
413
+ x = F.pad(x, (delta // 2, delta - delta // 2))
414
+
415
+ if self.resample:
416
+ x = julius.resample_frac(x, 1, 2)
417
+
418
+ saved = []
419
+ for encode in self.encoder:
420
+ x = encode(x)
421
+ saved.append(x)
422
+
423
+ if self.lstm:
424
+ x = self.lstm(x)
425
+
426
+ for decode in self.decoder:
427
+ skip = saved.pop(-1)
428
+ skip = center_trim(skip, x)
429
+ x = decode(x + skip)
430
+
431
+ if self.resample:
432
+ x = julius.resample_frac(x, 2, 1)
433
+ x = x * std + mean
434
+ x = center_trim(x, length)
435
+ x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
436
+ return x
437
+
438
+ def load_state_dict(self, state, strict=True):
439
+ # fix a mismatch with previous generation Demucs models.
440
+ for idx in range(self.depth):
441
+ for a in ['encoder', 'decoder']:
442
+ for b in ['bias', 'weight']:
443
+ new = f'{a}.{idx}.3.{b}'
444
+ old = f'{a}.{idx}.2.{b}'
445
+ if old in state and new not in state:
446
+ state[new] = state.pop(old)
447
+ super().load_state_dict(state, strict=strict)
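For context on how the padding in `valid_length` and the reshaping at the end of `forward` fit together, here is a minimal usage sketch. It assumes the class above is importable as `demucs.demucs.Demucs` and relies only on the constructor defaults; the tensor sizes are illustrative.

    import torch
    from demucs.demucs import Demucs

    model = Demucs(sources=['drums', 'bass', 'other', 'vocals'])
    length = 6 * model.samplerate                       # six seconds of audio
    print(model.valid_length(length))                   # length the input will be padded to
    mix = torch.randn(1, model.audio_channels, length)  # (batch, channels, time)
    with torch.no_grad():
        stems = model(mix)                              # forward() pads, then trims back
    print(stems.shape)                                  # (1, 4, audio_channels, length)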
demucs/distrib.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Distributed training utilities.
7
+ """
8
+ import logging
9
+ import pickle
10
+
11
+ import numpy as np
12
+ import torch
13
+ from torch.utils.data.distributed import DistributedSampler
14
+ from torch.utils.data import DataLoader, Subset
15
+ from torch.nn.parallel.distributed import DistributedDataParallel
16
+
17
+ from dora import distrib as dora_distrib
18
+
19
+ logger = logging.getLogger(__name__)
20
+ rank = 0
21
+ world_size = 1
22
+
23
+
24
+ def init():
25
+ global rank, world_size
26
+ if not torch.distributed.is_initialized():
27
+ dora_distrib.init()
28
+ rank = dora_distrib.rank()
29
+ world_size = dora_distrib.world_size()
30
+
31
+
32
+ def average(metrics, count=1.):
33
+ if isinstance(metrics, dict):
34
+ keys, values = zip(*sorted(metrics.items()))
35
+ values = average(values, count)
36
+ return dict(zip(keys, values))
37
+ if world_size == 1:
38
+ return metrics
39
+ tensor = torch.tensor(list(metrics) + [1], device='cuda', dtype=torch.float32)
40
+ tensor *= count
41
+ torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
42
+ return (tensor[:-1] / tensor[-1]).cpu().numpy().tolist()
43
+
44
+
45
+ def wrap(model):
46
+ if world_size == 1:
47
+ return model
48
+ else:
49
+ return DistributedDataParallel(
50
+ model,
51
+ # find_unused_parameters=True,
52
+ device_ids=[torch.cuda.current_device()],
53
+ output_device=torch.cuda.current_device())
54
+
55
+
56
+ def barrier():
57
+ if world_size > 1:
58
+ torch.distributed.barrier()
59
+
60
+
61
+ def share(obj=None, src=0):
62
+ if world_size == 1:
63
+ return obj
64
+ size = torch.empty(1, device='cuda', dtype=torch.long)
65
+ if rank == src:
66
+ dump = pickle.dumps(obj)
67
+ size[0] = len(dump)
68
+ torch.distributed.broadcast(size, src=src)
69
+ # size variable is now set to the length of pickled obj in all processes
70
+
71
+ if rank == src:
72
+ buffer = torch.from_numpy(np.frombuffer(dump, dtype=np.uint8).copy()).cuda()
73
+ else:
74
+ buffer = torch.empty(size[0].item(), device='cuda', dtype=torch.uint8)
75
+ torch.distributed.broadcast(buffer, src=src)
76
+ # buffer variable is now set to pickled obj in all processes
77
+
78
+ if rank != src:
79
+ obj = pickle.loads(buffer.cpu().numpy().tobytes())
80
+ logger.debug(f"Shared object of size {len(buffer)}")
81
+ return obj
82
+
83
+
84
+ def loader(dataset, *args, shuffle=False, klass=DataLoader, **kwargs):
85
+ """
86
+ Create a dataloader that behaves correctly under distributed training.
87
+ If a gradient is going to be computed, you must set `shuffle=True`.
88
+ """
89
+ if world_size == 1:
90
+ return klass(dataset, *args, shuffle=shuffle, **kwargs)
91
+
92
+ if shuffle:
93
+ # train means we will compute a backward pass, so we use DistributedSampler
94
+ sampler = DistributedSampler(dataset)
95
+ # We ignore shuffle, DistributedSampler already shuffles
96
+ return klass(dataset, *args, **kwargs, sampler=sampler)
97
+ else:
98
+ # We shard manually, as DistributedSampler would otherwise replicate some examples
99
+ dataset = Subset(dataset, list(range(rank, len(dataset), world_size)))
100
+ return klass(dataset, *args, shuffle=shuffle, **kwargs)
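As a rough sketch of how these helpers are meant to be combined in a training script (the dataset and model below are placeholders; `init()` is normally triggered by the Dora launcher, so with a single process the defaults `rank=0`, `world_size=1` make every helper a no-op):

    import torch
    from torch.utils.data import TensorDataset
    from demucs import distrib

    dataset = TensorDataset(torch.randn(64, 2, 44100))
    train_loader = distrib.loader(dataset, batch_size=4, shuffle=True)   # DistributedSampler when world_size > 1
    valid_loader = distrib.loader(dataset, batch_size=4, shuffle=False)  # manual shard, one slice per rank

    model = distrib.wrap(torch.nn.Conv1d(2, 2, 3))                       # DDP wrapper only when world_size > 1
    metrics = distrib.average({'loss': 0.5}, count=len(train_loader))    # weighted all-reduce across workers
    distrib.barrier()                                                     # no-op on a single process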
demucs/ema.py ADDED
@@ -0,0 +1,66 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Inspired from https://github.com/rwightman/pytorch-image-models
8
+ from contextlib import contextmanager
9
+
10
+ import torch
11
+
12
+ from .states import swap_state
13
+
14
+
15
+ class ModelEMA:
16
+ """
17
+ Perform EMA on a model. You can switch to the EMA weights temporarily
18
+ with the `swap` method.
19
+
20
+ ema = ModelEMA(model)
21
+ with ema.swap():
22
+ # compute valid metrics with averaged model.
23
+ """
24
+ def __init__(self, model, decay=0.9999, unbias=True, device='cpu'):
25
+ self.decay = decay
26
+ self.model = model
27
+ self.state = {}
28
+ self.count = 0
29
+ self.device = device
30
+ self.unbias = unbias
31
+
32
+ self._init()
33
+
34
+ def _init(self):
35
+ for key, val in self.model.state_dict().items():
36
+ if val.dtype != torch.float32:
37
+ continue
38
+ device = self.device or val.device
39
+ if key not in self.state:
40
+ self.state[key] = val.detach().to(device, copy=True)
41
+
42
+ def update(self):
43
+ if self.unbias:
44
+ self.count = self.count * self.decay + 1
45
+ w = 1 / self.count
46
+ else:
47
+ w = 1 - self.decay
48
+ for key, val in self.model.state_dict().items():
49
+ if val.dtype != torch.float32:
50
+ continue
51
+ device = self.device or val.device
52
+ self.state[key].mul_(1 - w)
53
+ self.state[key].add_(val.detach().to(device), alpha=w)
54
+
55
+ @contextmanager
56
+ def swap(self):
57
+ with swap_state(self.model, self.state):
58
+ yield
59
+
60
+ def state_dict(self):
61
+ return {'state': self.state, 'count': self.count}
62
+
63
+ def load_state_dict(self, state):
64
+ self.count = state['count']
65
+ for k, v in state['state'].items():
66
+ self.state[k].copy_(v)
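To make the intended flow concrete, here is a small illustrative training loop (the model, optimizer and data are placeholders) showing where `update()` and `swap()` are typically called:

    import torch
    from demucs.ema import ModelEMA

    model = torch.nn.Linear(8, 8)
    ema = ModelEMA(model, decay=0.999)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)

    for _ in range(100):
        loss = model(torch.randn(4, 8)).pow(2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()
        ema.update()                     # fold the new weights into the running average

    with ema.swap():                     # temporarily load the averaged weights
        val_loss = model(torch.randn(4, 8)).pow(2).mean()
    # outside the context manager the raw training weights are restored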
demucs/evaluate.py ADDED
@@ -0,0 +1,174 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Test time evaluation, either using the original SDR from [Vincent et al. 2006]
8
+ or the newest SDR definition from the MDX 2021 competition (this one will
9
+ be reported as `nsdr` for `new sdr`).
10
+ """
11
+
12
+ from concurrent import futures
13
+ import logging
14
+
15
+ from dora.log import LogProgress
16
+ import numpy as np
17
+ import musdb
18
+ import museval
19
+ import torch as th
20
+
21
+ from .apply import apply_model
22
+ from .audio import convert_audio, save_audio
23
+ from . import distrib
24
+ from .utils import DummyPoolExecutor
25
+
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ def new_sdr(references, estimates):
31
+ """
32
+ Compute the SDR according to the MDX challenge definition.
33
+ Adapted from AIcrowd/music-demixing-challenge-starter-kit (MIT license)
34
+ """
35
+ assert references.dim() == 4
36
+ assert estimates.dim() == 4
37
+ delta = 1e-7 # avoid numerical errors
38
+ num = th.sum(th.square(references), dim=(2, 3))
39
+ den = th.sum(th.square(references - estimates), dim=(2, 3))
40
+ num += delta
41
+ den += delta
42
+ scores = 10 * th.log10(num / den)
43
+ return scores
44
+
45
+
46
+ def eval_track(references, estimates, win, hop, compute_sdr=True):
47
+ references = references.transpose(1, 2).double()
48
+ estimates = estimates.transpose(1, 2).double()
49
+
50
+ new_scores = new_sdr(references.cpu()[None], estimates.cpu()[None])[0]
51
+
52
+ if not compute_sdr:
53
+ return None, new_scores
54
+ else:
55
+ references = references.numpy()
56
+ estimates = estimates.numpy()
57
+ scores = museval.metrics.bss_eval(
58
+ references, estimates,
59
+ compute_permutation=False,
60
+ window=win,
61
+ hop=hop,
62
+ framewise_filters=False,
63
+ bsseval_sources_version=False)[:-1]
64
+ return scores, new_scores
65
+
66
+
67
+ def evaluate(solver, compute_sdr=False):
68
+ """
69
+ Evaluate model using museval.
70
+ compute_sdr=False means using only the MDX definition of the SDR, which
71
+ is much faster to evaluate.
72
+ """
73
+
74
+ args = solver.args
75
+
76
+ output_dir = solver.folder / "results"
77
+ output_dir.mkdir(exist_ok=True, parents=True)
78
+ json_folder = solver.folder / "results/test"
79
+ json_folder.mkdir(exist_ok=True, parents=True)
80
+
81
+ # we load tracks from the original musdb set
82
+ if args.test.nonhq is None:
83
+ test_set = musdb.DB(args.dset.musdb, subsets=["test"], is_wav=True)
84
+ else:
85
+ test_set = musdb.DB(args.test.nonhq, subsets=["test"], is_wav=False)
86
+ src_rate = args.dset.musdb_samplerate
87
+
88
+ eval_device = 'cpu'
89
+
90
+ model = solver.model
91
+ win = int(1. * model.samplerate)
92
+ hop = int(1. * model.samplerate)
93
+
94
+ indexes = range(distrib.rank, len(test_set), distrib.world_size)
95
+ indexes = LogProgress(logger, indexes, updates=args.misc.num_prints,
96
+ name='Eval')
97
+ pendings = []
98
+
99
+ pool = futures.ProcessPoolExecutor if args.test.workers else DummyPoolExecutor
100
+ with pool(args.test.workers) as pool:
101
+ for index in indexes:
102
+ track = test_set.tracks[index]
103
+
104
+ mix = th.from_numpy(track.audio).t().float()
105
+ if mix.dim() == 1:
106
+ mix = mix[None]
107
+ mix = mix.to(solver.device)
108
+ ref = mix.mean(dim=0) # mono mixture
109
+ mix = (mix - ref.mean()) / ref.std()
110
+ mix = convert_audio(mix, src_rate, model.samplerate, model.audio_channels)
111
+ estimates = apply_model(model, mix[None],
112
+ shifts=args.test.shifts, split=args.test.split,
113
+ overlap=args.test.overlap)[0]
114
+ estimates = estimates * ref.std() + ref.mean()
115
+ estimates = estimates.to(eval_device)
116
+
117
+ references = th.stack(
118
+ [th.from_numpy(track.targets[name].audio).t() for name in model.sources])
119
+ if references.dim() == 2:
120
+ references = references[:, None]
121
+ references = references.to(eval_device)
122
+ references = convert_audio(references, src_rate,
123
+ model.samplerate, model.audio_channels)
124
+ if args.test.save:
125
+ folder = solver.folder / "wav" / track.name
126
+ folder.mkdir(exist_ok=True, parents=True)
127
+ for name, estimate in zip(model.sources, estimates):
128
+ save_audio(estimate.cpu(), folder / (name + ".mp3"), model.samplerate)
129
+
130
+ pendings.append((track.name, pool.submit(
131
+ eval_track, references, estimates, win=win, hop=hop, compute_sdr=compute_sdr)))
132
+
133
+ pendings = LogProgress(logger, pendings, updates=args.misc.num_prints,
134
+ name='Eval (BSS)')
135
+ tracks = {}
136
+ for track_name, pending in pendings:
137
+ pending = pending.result()
138
+ scores, nsdrs = pending
139
+ tracks[track_name] = {}
140
+ for idx, target in enumerate(model.sources):
141
+ tracks[track_name][target] = {'nsdr': [float(nsdrs[idx])]}
142
+ if scores is not None:
143
+ (sdr, isr, sir, sar) = scores
144
+ for idx, target in enumerate(model.sources):
145
+ values = {
146
+ "SDR": sdr[idx].tolist(),
147
+ "SIR": sir[idx].tolist(),
148
+ "ISR": isr[idx].tolist(),
149
+ "SAR": sar[idx].tolist()
150
+ }
151
+ tracks[track_name][target].update(values)
152
+
153
+ all_tracks = {}
154
+ for src in range(distrib.world_size):
155
+ all_tracks.update(distrib.share(tracks, src))
156
+
157
+ result = {}
158
+ metric_names = next(iter(all_tracks.values()))[model.sources[0]]
159
+ for metric_name in metric_names:
160
+ avg = 0
161
+ avg_of_medians = 0
162
+ for source in model.sources:
163
+ medians = [
164
+ np.nanmedian(all_tracks[track][source][metric_name])
165
+ for track in all_tracks.keys()]
166
+ mean = np.mean(medians)
167
+ median = np.median(medians)
168
+ result[metric_name.lower() + "_" + source] = mean
169
+ result[metric_name.lower() + "_med" + "_" + source] = median
170
+ avg += mean / len(model.sources)
171
+ avg_of_medians += median / len(model.sources)
172
+ result[metric_name.lower()] = avg
173
+ result[metric_name.lower() + "_med"] = avg_of_medians
174
+ return result
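A quick, synthetic sanity check of the `new_sdr` convention (shapes are `(batch, sources, channels, time)`; the numbers are random and only illustrate the expected shapes and sign):

    import torch as th
    from demucs.evaluate import new_sdr

    references = th.randn(1, 4, 2, 44100)                      # (batch, sources, channels, time)
    estimates = references + 0.01 * th.randn_like(references)  # a nearly perfect separation
    scores = new_sdr(references, estimates)                     # dB, higher is better
    print(scores.shape)                                         # torch.Size([1, 4]), one score per source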
demucs/grids/__init__.py ADDED
File without changes
demucs/grids/_explorers.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ from dora import Explorer
7
+ import treetable as tt
8
+
9
+
10
+ class MyExplorer(Explorer):
11
+ test_metrics = ['nsdr', 'sdr_med']
12
+
13
+ def get_grid_metrics(self):
14
+ """Return the metrics that should be displayed in the tracking table.
15
+ """
16
+ return [
17
+ tt.group("train", [
18
+ tt.leaf("epoch"),
19
+ tt.leaf("reco", ".3f"),
20
+ ], align=">"),
21
+ tt.group("valid", [
22
+ tt.leaf("penalty", ".1f"),
23
+ tt.leaf("ms", ".1f"),
24
+ tt.leaf("reco", ".2%"),
25
+ tt.leaf("breco", ".2%"),
26
+ tt.leaf("b_nsdr", ".2f"),
27
+ # tt.leaf("b_nsdr_drums", ".2f"),
28
+ # tt.leaf("b_nsdr_bass", ".2f"),
29
+ # tt.leaf("b_nsdr_other", ".2f"),
30
+ # tt.leaf("b_nsdr_vocals", ".2f"),
31
+ ], align=">"),
32
+ tt.group("test", [
33
+ tt.leaf(name, ".2f")
34
+ for name in self.test_metrics
35
+ ], align=">")
36
+ ]
37
+
38
+ def process_history(self, history):
39
+ train = {
40
+ 'epoch': len(history),
41
+ }
42
+ valid = {}
43
+ test = {}
44
+ best_v_main = float('inf')
45
+ breco = float('inf')
46
+ for metrics in history:
47
+ train.update(metrics['train'])
48
+ valid.update(metrics['valid'])
49
+ if 'main' in metrics['valid']:
50
+ best_v_main = min(best_v_main, metrics['valid']['main']['loss'])
51
+ valid['bmain'] = best_v_main
52
+ valid['breco'] = min(breco, metrics['valid']['reco'])
53
+ breco = valid['breco']
54
+ if (metrics['valid']['loss'] == metrics['valid']['best'] or
55
+ metrics['valid'].get('nsdr') == metrics['valid']['best']):
56
+ for k, v in metrics['valid'].items():
57
+ if k.startswith('reco_'):
58
+ valid['b_' + k[len('reco_'):]] = v
59
+ if k.startswith('nsdr'):
60
+ valid[f'b_{k}'] = v
61
+ if 'test' in metrics:
62
+ test.update(metrics['test'])
63
+ metrics = history[-1]
64
+ return {"train": train, "valid": valid, "test": test}
demucs/grids/mdx.py ADDED
@@ -0,0 +1,33 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ Main training for the Track A MDX models.
8
+ """
9
+
10
+ from ._explorers import MyExplorer
11
+ from ..train import main
12
+
13
+
14
+ TRACK_A = ['0d19c1c6', '7ecf8ec1', 'c511e2ab', '7d865c68']
15
+
16
+
17
+ @MyExplorer
18
+ def explorer(launcher):
19
+ launcher.slurm_(
20
+ gpus=8,
21
+ time=3 * 24 * 60,
22
+ partition='learnlab')
23
+
24
+ # Reproduce results from MDX competition Track A
25
+ # This trains the first round of models. Once this is trained,
26
+ # you will need to schedule `mdx_refine`.
27
+ for sig in TRACK_A:
28
+ xp = main.get_xp_from_sig(sig)
29
+ parent = xp.cfg.continue_from
30
+ xp = main.get_xp_from_sig(parent)
31
+ launcher(xp.argv)
32
+ launcher(xp.argv, {'quant.diffq': 1e-4})
33
+ launcher(xp.argv, {'quant.diffq': 3e-4})
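For context, the entries of `TRACK_A` are Dora experiment signatures. A hedged sketch of how a signature resolves back to an experiment, mirroring what the grid does before scheduling (this assumes the corresponding experiments exist locally):

    from demucs.train import main

    sig = '0d19c1c6'                       # one of the TRACK_A signatures above
    xp = main.get_xp_from_sig(sig)         # resolve the Dora experiment from its signature
    print(xp.cfg.continue_from)            # each Track A model is fine-tuned from a parent experiment
    print(xp.argv)                         # the overrides that fully define the experiment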
demucs/grids/mdx_extra.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ Main training for the Track B MDX models.
8
+ """
9
+
10
+ from ._explorers import MyExplorer
11
+ from ..train import main
12
+
13
+ TRACK_B = ['e51eebcc', 'a1d90b5c', '5d2d6c55', 'cfa93e08']
14
+
15
+
16
+ @MyExplorer
17
+ def explorer(launcher):
18
+ launcher.slurm_(
19
+ gpus=8,
20
+ time=3 * 24 * 60,
21
+ partition='learnlab')
22
+
23
+ # Reproduce results from MDX competition Track B.
24
+ # Each signature is walked back through `continue_from` to its root
25
+ # experiment, which is then retrained on the extra datasets below.
26
+ for sig in TRACK_B:
27
+ while sig is not None:
28
+ xp = main.get_xp_from_sig(sig)
29
+ sig = xp.cfg.continue_from
30
+
31
+ for dset in ['extra44', 'extra_test']:
32
+ sub = launcher.bind(xp.argv, dset=dset)
33
+ sub()
34
+ if dset == 'extra_test':
35
+ sub({'quant.diffq': 1e-4})
36
+ sub({'quant.diffq': 3e-4})
demucs/grids/mdx_refine.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ Fine-tuning for the Track A MDX models.
8
+ """
9
+
10
+ from ._explorers import MyExplorer
11
+ from .mdx import TRACK_A
12
+ from ..train import main
13
+
14
+
15
+ @MyExplorer
16
+ def explorer(launcher):
17
+ launcher.slurm_(
18
+ gpus=8,
19
+ time=3 * 24 * 60,
20
+ partition='learnlab')
21
+
22
+ # Reproduce results from MDX competition Track A
23
+ # WARNING: all the experiments in the `mdx` grid must have completed.
24
+ for sig in TRACK_A:
25
+ xp = main.get_xp_from_sig(sig)
26
+ launcher(xp.argv)
27
+ for diffq in [1e-4, 3e-4]:
28
+ xp_src = main.get_xp_from_sig(xp.cfg.continue_from)
29
+ q_argv = [f'quant.diffq={diffq}']
30
+ actual_src = main.get_xp(xp_src.argv + q_argv)
31
+ actual_src.link.load()
32
+ assert len(actual_src.link.history) == actual_src.cfg.epochs
33
+ argv = xp.argv + q_argv + [f'continue_from="{actual_src.sig}"']
34
+ launcher(argv)