{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "first-order-model-demo.ipynb", "provenance": [], "toc_visible": true, "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "cdO_RxQZLahB" }, "source": [ "# Demo for paper \"First Order Motion Model for Image Animation\"" ] }, { "cell_type": "markdown", "metadata": { "id": "GCDNKsEGLtR6" }, "source": [ "**Clone repository**" ] }, { "cell_type": "code", "metadata": { "id": "UCMFMJV7K-ag", "colab": { "base_uri": "https://localhost:8080/", "height": 104 }, "outputId": "836efe50-65d7-4c95-a17e-13fc3922b491" }, "source": [ "!git clone https://github.com/AliaksandrSiarohin/first-order-model" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Cloning into 'first-order-model'...\n", "remote: Enumerating objects: 246, done.\u001b[K\n", "remote: Total 246 (delta 0), reused 0 (delta 0), pack-reused 246\u001b[K\n", "Receiving objects: 100% (246/246), 71.46 MiB | 22.54 MiB/s, done.\n", "Resolving deltas: 100% (121/121), done.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "PBp6l_4bBYUL", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "702bdf2c-d6d2-41ec-cf3c-9196452544d8" }, "source": [ "cd first-order-model" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "/content/first-order-model\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "IcMX7ueZO0Oa" }, "source": [ "**Mount your Google drive folder on Colab**" ] }, { "cell_type": "code", "metadata": { "id": "tDbMA8R9OuUo", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "f92ce039-d693-4e03-879b-d0d49a121da2" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/gdrive')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Mounted at /content/gdrive\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "VsgVK1EURXkd" }, "source": [ "**Add folder https://drive.google.com/drive/folders/1kZ1gCnpfU0BnpdU47pLM_TQ6RypDDqgw?usp=sharing to your google drive.\n", "Alternativelly you can use this mirror link https://drive.google.com/drive/folders/16inDpBRPT1UC0YMGMX3dKvRnOUsf5Dhn?usp=sharing**" ] }, { "cell_type": "markdown", "metadata": { "id": "rW-ipQXPOWUo" }, "source": [ "**Load driving video and source image**" ] }, { "cell_type": "code", "metadata": { "id": "Oxi6-riLOgnm", "colab": { "base_uri": "https://localhost:8080/", "height": 453 }, "outputId": "d38a8850-9eb1-4de4-9bf2-24cbd847ca1f" }, "source": [ "import imageio\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import matplotlib.animation as animation\n", "from skimage.transform import resize\n", "from IPython.display import HTML\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "source_image = imageio.imread('/content/gdrive/My Drive/first-order-motion-model/02.png')\n", "reader = imageio.get_reader('/content/gdrive/My Drive/first-order-motion-model/04.mp4')\n", "\n", "\n", "#Resize image and video to 256x256\n", "\n", "source_image = resize(source_image, (256, 256))[..., :3]\n", "\n", "fps = reader.get_meta_data()['fps']\n", "driving_video = []\n", "try:\n", " for im in reader:\n", " driving_video.append(im)\n", "except RuntimeError:\n", " pass\n", "reader.close()\n", "\n", "driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]\n", "\n", "def display(source, driving, generated=None):\n", " fig = plt.figure(figsize=(8 + 4 * (generated is not None), 6))\n", "\n", " ims = []\n", " for i in range(len(driving)):\n", " cols = [source]\n", " cols.append(driving[i])\n", " if generated is not None:\n", " cols.append(generated[i])\n", " im = plt.imshow(np.concatenate(cols, axis=1), animated=True)\n", " plt.axis('off')\n", " ims.append([im])\n", "\n", " ani = animation.ArtistAnimation(fig, ims, interval=50, repeat_delay=1000)\n", " plt.close()\n", " return ani\n", " \n", "\n", "HTML(display(source_image, driving_video).to_html5_video())" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "markdown", "metadata": { "id": "xjM7ubVfWrwT" }, "source": [ "**Create a model and load checkpoints**" ] }, { "cell_type": "code", "metadata": { "id": "3FQiXqQPWt5B" }, "source": [ "from demo import load_checkpoints\n", "generator, kp_detector = load_checkpoints(config_path='config/vox-256.yaml', \n", " checkpoint_path='/content/gdrive/My Drive/first-order-motion-model/vox-cpk.pth.tar')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "fdFdasHEj3t7" }, "source": [ "**Perform image animation**" ] }, { "cell_type": "code", "metadata": { "id": "SB12II11kF4c", "colab": { "base_uri": "https://localhost:8080/", "height": 471 }, "outputId": "9e2274aa-fd55-4eed-cb50-bec72fcfb8b9" }, "source": [ "from demo import make_animation\n", "from skimage import img_as_ubyte\n", "\n", "predictions = make_animation(source_image, driving_video, generator, kp_detector, relative=True)\n", "\n", "#save resulting video\n", "imageio.mimsave('../generated.mp4', [img_as_ubyte(frame) for frame in predictions], fps=fps)\n", "#video can be downloaded from /content folder\n", "\n", "HTML(display(source_image, driving_video, predictions).to_html5_video())" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "100%|██████████| 211/211 [00:26<00:00, 7.92it/s]\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "markdown", "metadata": { "id": "-tJN01xQCpqH" }, "source": [ "**In the cell above we use relative keypoint displacement to animate the objects. We can use absolute coordinates instead, but in this way all the object proporions will be inherited from the driving video. For example Putin haircut will be extended to match Trump haircut.**" ] }, { "cell_type": "code", "metadata": { "id": "aOE_W_kfC9aX", "colab": { "base_uri": "https://localhost:8080/", "height": 471 }, "outputId": "f472a888-0200-4b21-b6d2-b6f6737bc9e5" }, "source": [ "predictions = make_animation(source_image, driving_video, generator, kp_detector, relative=False, adapt_movement_scale=True)\n", "HTML(display(source_image, driving_video, predictions).to_html5_video())" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "100%|██████████| 211/211 [00:26<00:00, 7.90it/s]\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 8 } ] }, { "cell_type": "markdown", "metadata": { "id": "QnXrecuX6_Kw" }, "source": [ "## Running on your data\n", "\n", "**First we need to crop a face from both source image and video, while simple graphic editor like paint can be used for cropping from image. Cropping from video is more complicated. You can use ffpmeg for this.**" ] }, { "cell_type": "code", "metadata": { "id": "brJlA_5o72Xc", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "c2becb7e-e2ee-4651-ee8a-f906b5663417" }, "source": [ "!ffmpeg -i /content/gdrive/My\\ Drive/first-order-motion-model/07.mkv -ss 00:08:57.50 -t 00:00:08 -filter:v \"crop=600:600:760:50\" -async 1 hinton.mp4" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "ffmpeg version 3.4.8-0ubuntu0.2 Copyright (c) 2000-2020 the FFmpeg developers\n", " built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)\n", " configuration: --prefix=/usr --extra-version=0ubuntu0.2 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-omx --enable-openal --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libopencv --enable-libx264 --enable-shared\n", " libavutil 55. 78.100 / 55. 78.100\n", " libavcodec 57.107.100 / 57.107.100\n", " libavformat 57. 83.100 / 57. 83.100\n", " libavdevice 57. 10.100 / 57. 10.100\n", " libavfilter 6.107.100 / 6.107.100\n", " libavresample 3. 7. 0 / 3. 7. 0\n", " libswscale 4. 8.100 / 4. 8.100\n", " libswresample 2. 9.100 / 2. 9.100\n", " libpostproc 54. 7.100 / 54. 7.100\n", "Input #0, matroska,webm, from '/content/gdrive/My Drive/first-order-motion-model/07.mkv':\n", " Metadata:\n", " ENCODER : Lavf57.83.100\n", " Duration: 00:14:59.73, start: 0.000000, bitrate: 2343 kb/s\n", " Stream #0:0(eng): Video: vp9 (Profile 0), yuv420p(tv, bt709), 1920x1080, SAR 1:1 DAR 16:9, 29.97 fps, 29.97 tbr, 1k tbn, 1k tbc (default)\n", " Metadata:\n", " DURATION : 00:14:59.665000000\n", " Stream #0:1(eng): Audio: aac (LC), 44100 Hz, stereo, fltp (default)\n", " Metadata:\n", " HANDLER_NAME : SoundHandler\n", " DURATION : 00:14:59.727000000\n", "Stream mapping:\n", " Stream #0:0 -> #0:0 (vp9 (native) -> h264 (libx264))\n", " Stream #0:1 -> #0:1 (aac (native) -> aac (native))\n", "Press [q] to stop, [?] for help\n", "-async is forwarded to lavfi similarly to -af aresample=async=1:min_hard_comp=0.100000:first_pts=0.\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0musing SAR=1/1\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0musing cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mprofile High, level 3.1\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0m264 - core 152 r2854 e9a5903 - H.264/MPEG-4 AVC codec - Copyleft 2003-2017 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=3 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\n", "Output #0, mp4, to 'hinton.mp4':\n", " Metadata:\n", " encoder : Lavf57.83.100\n", " Stream #0:0(eng): Video: h264 (libx264) (avc1 / 0x31637661), yuv420p, 600x600 [SAR 1:1 DAR 1:1], q=-1--1, 29.97 fps, 30k tbn, 29.97 tbc (default)\n", " Metadata:\n", " DURATION : 00:14:59.665000000\n", " encoder : Lavc57.107.100 libx264\n", " Side data:\n", " cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: -1\n", " Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 128 kb/s (default)\n", " Metadata:\n", " HANDLER_NAME : SoundHandler\n", " DURATION : 00:14:59.727000000\n", " encoder : Lavc57.107.100 aac\n", "frame= 240 fps=2.5 q=-1.0 Lsize= 1301kB time=00:00:08.01 bitrate=1330.6kbits/s speed=0.0844x \n", "video:1166kB audio:125kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.761764%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mframe I:1 Avg QP:22.44 size: 28019\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mframe P:62 Avg QP:23.31 size: 12894\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mframe B:177 Avg QP:28.63 size: 2068\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mconsecutive B-frames: 0.8% 1.7% 2.5% 95.0%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mmb I I16..4: 12.7% 76.2% 11.1%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mmb P I16..4: 1.9% 8.9% 1.1% P16..4: 35.3% 21.3% 10.8% 0.0% 0.0% skip:20.7%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mmb B I16..4: 0.0% 0.1% 0.0% B16..8: 39.1% 5.4% 1.0% direct: 1.4% skip:52.9% L0:35.4% L1:48.5% BI:16.2%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0m8x8 transform intra:75.2% inter:77.3%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mcoded y,uvDC,uvAC intra: 61.9% 52.1% 5.8% inter: 15.2% 6.9% 0.0%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mi16 v,h,dc,p: 69% 8% 8% 15%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mi8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 25% 10% 19% 5% 8% 11% 8% 9% 6%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mi4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 23% 8% 11% 5% 12% 21% 7% 9% 4%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mi8c dc,h,v,p: 53% 20% 19% 8%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mWeighted P-Frames: Y:21.0% UV:1.6%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mref P L0: 57.9% 21.2% 14.0% 5.9% 1.1%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mref B L0: 93.5% 5.3% 1.2%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mref B L1: 97.4% 2.6%\n", "\u001b[1;36m[libx264 @ 0x55709b1c4800] \u001b[0mkb/s:1192.28\n", "\u001b[1;36m[aac @ 0x55709b1c5700] \u001b[0mQavg: 534.430\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "NSHSxV8iGybI" }, "source": [ "**Another posibility is to use some screen recording tool, or if you need to crop many images at ones use face detector(https://github.com/1adrianb/face-alignment) , see https://github.com/AliaksandrSiarohin/video-preprocessing for preprcessing of VoxCeleb.** " ] }, { "cell_type": "code", "metadata": { "id": "d8kQ3U7MHqh-", "colab": { "base_uri": "https://localhost:8080/", "height": 471 }, "outputId": "8890e858-ac38-4d59-b014-cc4ed974f276" }, "source": [ "source_image = imageio.imread('/content/gdrive/My Drive/first-order-motion-model/09.png')\n", "driving_video = imageio.mimread('hinton.mp4', memtest=False)\n", "\n", "\n", "#Resize image and video to 256x256\n", "\n", "source_image = resize(source_image, (256, 256))[..., :3]\n", "driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]\n", "\n", "predictions = make_animation(source_image, driving_video, generator, kp_detector, relative=True,\n", " adapt_movement_scale=True)\n", "\n", "HTML(display(source_image, driving_video, predictions).to_html5_video())" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "100%|██████████| 240/240 [00:30<00:00, 7.94it/s]\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] } ] }