Commit c2a12e1
author: somanchiu
committed: init
1 parent b699ec8 commit c2a12e1

22 files changed: +1549 -0 lines changed

INSwapper.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
import numpy as np
import onnxruntime
import cv2
import onnx
from onnx import numpy_helper
from insightface.utils import face_align

class INSwapper():
    def __init__(self, model_file=None, session=None):
        self.model_file = model_file
        self.session = session
        model = onnx.load(self.model_file)
        graph = model.graph
        # "emap" is stored as the last initializer of the original inswapper graph
        self.emap = numpy_helper.to_array(graph.initializer[-1])

        # emapFile = f'training/dataset/v6/emap.npy'
        # np.save(emapFile, self.emap)
        # emap = np.load(emapFile)

        self.input_mean = 0.0
        self.input_std = 255.0
        #print('input mean and std:', model_file, self.input_mean, self.input_std)
        if self.session is None:
            self.session = onnxruntime.InferenceSession(self.model_file, None)
        inputs = self.session.get_inputs()
        self.input_names = []
        for inp in inputs:
            self.input_names.append(inp.name)
        outputs = self.session.get_outputs()
        output_names = []
        for out in outputs:
            output_names.append(out.name)
        self.output_names = output_names
        assert len(self.output_names) == 1
        output_shape = outputs[0].shape
        input_cfg = inputs[0]
        input_shape = input_cfg.shape
        self.input_shape = input_shape
        print('inswapper-shape:', self.input_shape)
        self.input_size = tuple(input_shape[2:4][::-1])

    def forward(self, img, latent):
        img = (img - self.input_mean) / self.input_std
        pred = self.session.run(self.output_names, {self.input_names[0]: img, self.input_names[1]: latent})[0]
        return pred

    def predict(self, blob, latent):
        input_feed = {self.input_names[0]: blob, self.input_names[1]: latent}
        pred = self.session.run(self.output_names, input_feed)[0]
        return pred

    def test(self, img, target_face, source_face):
        aimg, M = face_align.norm_crop2(img, target_face.kps, self.input_size[0])
        blob = cv2.dnn.blobFromImage(aimg, 1.0 / self.input_std, self.input_size,
                                     (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
        latent = source_face.normed_embedding.reshape((1, -1))
        latent = np.dot(latent, self.emap)
        latent /= np.linalg.norm(latent)

        pred = self.predict(blob, latent)
        return pred

    def getBlob(self, aimg):
        # NCHW float blob, scaled to [0, 1] and converted BGR -> RGB
        blob = cv2.dnn.blobFromImage(aimg, 1.0 / self.input_std, self.input_size,
                                     (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
        return blob

    def getLatent(self, source_face):
        # project the ArcFace embedding through emap and re-normalize
        latent = source_face.normed_embedding.reshape((1, -1))
        latent = np.dot(latent, self.emap)
        latent /= np.linalg.norm(latent)

        return latent

    def swap(self, alignedTargetFace, source_face):
        latent = self.getLatent(source_face)

        pred = self.predict(self.getBlob(alignedTargetFace), latent)
        #print(latent.shape, latent.dtype, pred.shape)
        # model output is NCHW RGB in [0, 1]; convert back to uint8 BGR
        img_fake = pred.transpose((0, 2, 3, 1))[0]
        bgr_fake = np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:, :, ::-1]

        return bgr_fake

    def swapAndPasteBack(self, img, alignedTargetFace, alignedTargetFaceM, bgr_fake):
        target_img = img
        fake_diff = bgr_fake.astype(np.float32) - alignedTargetFace.astype(np.float32)
        fake_diff = np.abs(fake_diff).mean(axis=2)
        # zero out a 2-pixel border so edge artifacts don't leak into the mask
        fake_diff[:2, :] = 0
        fake_diff[-2:, :] = 0
        fake_diff[:, :2] = 0
        fake_diff[:, -2:] = 0
        IM = cv2.invertAffineTransform(alignedTargetFaceM)
        img_white = np.full((alignedTargetFace.shape[0], alignedTargetFace.shape[1]), 255, dtype=np.float32)
        bgr_fake = cv2.warpAffine(bgr_fake, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
        img_white = cv2.warpAffine(img_white, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
        fake_diff = cv2.warpAffine(fake_diff, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
        img_white[img_white > 20] = 255
        fthresh = 10
        fake_diff[fake_diff < fthresh] = 0
        fake_diff[fake_diff >= fthresh] = 255
        img_mask = img_white
        mask_h_inds, mask_w_inds = np.where(img_mask == 255)
        mask_h = np.max(mask_h_inds) - np.min(mask_h_inds)
        mask_w = np.max(mask_w_inds) - np.min(mask_w_inds)
        mask_size = int(np.sqrt(mask_h * mask_w))
        # erode the mask, then feather it with a Gaussian blur for a soft blend
        k = max(mask_size // 10, 10)
        #k = max(mask_size//20, 6)
        #k = 6
        kernel = np.ones((k, k), np.uint8)
        img_mask = cv2.erode(img_mask, kernel, iterations=1)
        kernel = np.ones((2, 2), np.uint8)
        fake_diff = cv2.dilate(fake_diff, kernel, iterations=1)
        k = max(mask_size // 20, 5)
        #k = 3
        kernel_size = (k, k)
        blur_size = tuple(2 * i + 1 for i in kernel_size)
        img_mask = cv2.GaussianBlur(img_mask, blur_size, 0)
        k = 5
        kernel_size = (k, k)
        blur_size = tuple(2 * i + 1 for i in kernel_size)
        fake_diff = cv2.GaussianBlur(fake_diff, blur_size, 0)
        img_mask /= 255
        fake_diff /= 255
        #img_mask = fake_diff
        img_mask = np.reshape(img_mask, [img_mask.shape[0], img_mask.shape[1], 1])
        fake_merged = img_mask * bgr_fake + (1 - img_mask) * target_img.astype(np.float32)
        fake_merged = fake_merged.astype(np.uint8)
        return fake_merged

    def get(self, img, target_face, source_face, paste_back=True):
        aimg, M = face_align.norm_crop2(img, target_face.kps, self.input_size[0])

        bgr_fake = self.swap(aimg, source_face)
        if not paste_back:
            return bgr_fake, M
        else:
            return self.swapAndPasteBack(img, aimg, M, bgr_fake)
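A minimal usage sketch of this class (face detection via insightface's FaceAnalysis; the model path and image filenames here are assumptions, not part of the commit):

```python
import cv2
from insightface.app import FaceAnalysis
from INSwapper import INSwapper

app = FaceAnalysis()
app.prepare(ctx_id=0, det_size=(640, 640))
swapper = INSwapper('inswapper_128.onnx')  # assumed model path

target_img = cv2.imread('target.jpg')
source_img = cv2.imread('source.jpg')
target_face = app.get(target_img)[0]  # detected faces expose .kps and .normed_embedding
source_face = app.get(source_img)[0]

result = swapper.get(target_img, target_face, source_face, paste_back=True)
cv2.imwrite('output.jpg', result)
```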

Image.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
import cv2
import numpy as np

# emap.npy must be extracted from the original inswapper model first
emap = np.load("emap.npy")
input_std = 255.0
input_mean = 0.0
input_size = (128, 128)

def postprocess_face(face_tensor):
    # convert a [1, 3, H, W] torch tensor in [0, 1] back to a uint8 BGR image
    face_tensor = face_tensor.squeeze().cpu().detach()
    face_np = (face_tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
    face_np = cv2.cvtColor(face_np, cv2.COLOR_RGB2BGR)

    return face_np

def getBlob(aimg):
    # NCHW float blob, scaled to [0, 1] and converted BGR -> RGB
    blob = cv2.dnn.blobFromImage(aimg, 1.0 / input_std, input_size,
                                 (input_mean, input_mean, input_mean), swapRB=True)
    return blob

def getLatent(source_face):
    # project the ArcFace embedding through emap and re-normalize
    latent = source_face.normed_embedding.reshape((1, -1))
    latent = np.dot(latent, emap)
    latent /= np.linalg.norm(latent)

    return latent
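A minimal sketch of how these helpers feed the swapper model, mirroring INSwapper.predict (the model path, the ONNX input names 'target'/'source', and the detected source_face are assumptions):

```python
import cv2
import onnxruntime

session = onnxruntime.InferenceSession("inswapper_128.onnx", None)  # assumed model path
aligned_face = cv2.imread("aligned_target_face.jpg")  # a 128x128 aligned face crop (assumed file)
blob = getBlob(aligned_face)     # (1, 3, 128, 128) float32
latent = getLatent(source_face)  # (1, 512) float32; source_face from insightface detection
pred = session.run(None, {"target": blob, "source": latent})[0]
```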

README.md

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
# ReSwapper

ReSwapper aims to reproduce the implementation of inswapper. This repository provides code for training, inference, and includes pretrained weights.

Here is a comparison of the outputs of Inswapper and ReSwapper.

| Target | Source | Inswapper Output | ReSwapper Output (Step 429500) |
|--------|--------|--------|--------|
| ![targetImg](example/1/target.jpg) | ![sourceImg](example/1/source.jpg) | ![inswapperOutput](example/1/inswapperOutput.jpg) | ![reswapperOutput](example/1/reswapperOutput.jpg) |
| ![targetImg](example/2/target.jpg) | ![sourceImg](example/2/source.jpg) | ![inswapperOutput](example/2/inswapperOutput.jpg) | ![reswapperOutput](example/2/reswapperOutput.jpg) |
| ![targetImg](example/3/target.jpg) | ![sourceImg](example/3/source.png) | ![inswapperOutput](example/3/inswapperOutput.jpg) | ![reswapperOutput](example/3/reswapperOutput.jpg) |

## Installation

```bash
git clone https://github.com/somanchiu/ReSwapper.git
cd ReSwapper
python -m venv venv

venv\scripts\activate

pip install -r requirements.txt

pip install torch torchvision --force --index-url https://download.pytorch.org/whl/cu121
pip install onnxruntime-gpu --force --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
```

## The details of inswapper

### Model architecture
The inswapper model architecture can be visualized in [Netron](https://netron.app). You can compare it with the ReSwapper implementation to see the architectural similarities.

We can also use the following Python code to get more details:
```python
import onnx

model = onnx.load('test.onnx')
printable_graph = onnx.helper.printable_graph(model.graph)
print(printable_graph)
```

### Model input
- target: [1, 3, 128, 128] shape, normalized to the [0, 1] range (the code uses input_mean = 0.0 and input_std = 255.0)
- source (latent): [1, 512] shape, the features of the source face
- Calculation of the latent: "emap" can be extracted from the original inswapper model (see the sketch after this code block).
```python
latent = source_face.normed_embedding.reshape((1,-1))
latent = np.dot(latent, emap)
latent /= np.linalg.norm(latent)
```
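
A minimal sketch of the emap extraction (INSwapper.py in this commit reads it the same way; the model filename is an assumption):
```python
import numpy as np
import onnx
from onnx import numpy_helper

model = onnx.load('inswapper_128.onnx')  # assumed filename of the original inswapper model
# emap is stored as the last initializer of the graph
emap = numpy_helper.to_array(model.graph.initializer[-1])
np.save('emap.npy', emap)  # Image.py loads it from this file
```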
47+
48+
49+
### Loss Functions
50+
There is no information released from insightface. It is an important part of the training. However, there are a lot of articles and papers that can be referenced. By reading a substantial number of articles and papers on face swapping, ID fidelity, and style transfer, you'll frequently encounter the following keywords:
51+
- content loss
52+
- style loss/id loss
53+
- perceptual loss
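
One concrete combination, and the one StyleTransferLoss.py in this commit implements, is an SSIM-based content loss plus a cosine-similarity identity loss over face latents:
```python
import torch
from pytorch_msssim import ssim

def swap_losses(output_image, target_image, output_latent, target_latent):
    # content loss: 1 - SSIM between output and target, with images in [0, 1]
    content_loss = 1 - ssim(output_image, target_image, data_range=1.0)
    # identity loss: cosine similarity remapped from [-1, 1] to [0, 1],
    # inverted, squared, and scaled (same shaping as StyleTransferLoss.forward)
    similarity = torch.nn.functional.cosine_similarity(output_latent, target_latent, dim=0)
    identity_loss = (1 - (similarity + 1) / 2) ** 2 * 10
    return content_loss, identity_loss
```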

## Training
### 0. Pretrained weights (Optional)
If you don't want to train the model from scratch, you can download the pretrained weights and pass model_path into the train function in train.py.

### 1. Dataset Preparation
Download [FFHQ](https://www.kaggle.com/datasets/arnaud58/flickrfaceshq-dataset-ffhq) to use as target and source images. For the swapped face images, we can use the inswapper output.

### 2. Model Training

Optimizer: Adam

Learning rate: 0.0001
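
A minimal sketch of this configuration (assuming the generator in train.py is a torch.nn.Module named model):
```python
import torch

# stated training configuration; the model variable is an assumption
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
```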

Modify the code in train.py if needed. Then, execute:
```bash
python train.py
```

The model will be saved as "reswapper-\<total steps\>.pth".

## Notes
- Do not stop the training too early.

- I'm using an RTX3060 12GB for training. It takes around 12 hours for 50,000 steps.
- The optimizer may need to be changed to SGD for the final training, as many articles show that SGD can result in lower loss.

## Inference
```bash
python swap.py
```

## Pretrained Model

- [reswapper-429500.pth](https://huggingface.co/somanchiu/reswapper/tree/main)

## To Do
- Create 512 resolution model
- Implement face paste-back functionality
- Add emap to the onnx file

StyleTransferLoss.py

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
import cv2
import torch
import torch.nn as nn
import numpy as np
from insightface.app import FaceAnalysis
from pytorch_msssim import ssim

class StyleTransferLoss(nn.Module):
    def __init__(self, device='cuda', inswapper=None):
        super(StyleTransferLoss, self).__init__()
        self.face_analysis = FaceAnalysis(providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
        self.face_analysis.prepare(ctx_id=0, det_size=(128, 128))
        self.device = device
        self.inswapper = inswapper
        self.cosine_similarity = nn.CosineSimilarity(dim=0)

        # Content loss
        self.content_loss = nn.MSELoss()

        # Style loss
        self.style_loss = nn.MSELoss()

        # Face identity loss
        self.identity_loss = nn.CosineSimilarity(dim=1, eps=1e-6)

    def gram_matrix(self, input):
        # a, b, c, d = input.size()
        # features = input.view(a * b, c * d)
        G = torch.mm(input, input.t())
        return G

    def extract_face_embedding(self, image):
        # Convert torch tensor to numpy array (uint8 BGR)
        face_tensor = image.squeeze().cpu().detach()
        # face_tensor = (face_tensor * 0.5 + 0.5).clamp(0, 1)
        face_np = (face_tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
        face_np = cv2.cvtColor(face_np, cv2.COLOR_RGB2BGR)

        # Extract face embedding; None if no face is detected
        faces = self.face_analysis.get(face_np)
        if len(faces) == 0:
            return None
        return torch.tensor(faces[0].normed_embedding).to(self.device)

    def extract_face_latent(self, image):
        # Convert torch tensor to numpy array (uint8 BGR)
        face_tensor = image.squeeze().cpu().detach()
        # face_tensor = (face_tensor * 0.5 + 0.5).clamp(0, 1)
        face_np = (face_tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
        face_np = cv2.cvtColor(face_np, cv2.COLOR_RGB2BGR)

        # Extract the inswapper latent of the detected face; None if no face is detected
        faces = self.face_analysis.get(face_np)
        if len(faces) == 0:
            return None
        return torch.tensor(self.inswapper.getLatent(faces[0])[0]).to(self.device)

    def get_style_loss(self, latent1, latent2):
        # target = torch.tensor([1.0]).to("cuda")
        similarity = torch.dot(latent1, latent2)

        # # Binary Cross-Entropy Loss
        # epsilon = 1e-7  # Small value to avoid log(0)
        # loss = -target * torch.log(similarity + epsilon) - (1 - target) * torch.log(1 - similarity + epsilon)

        return 1 - similarity

    def forward(self, output_image, target_content, target_face_latent, source_face_latent):
        # Content loss: 1 - SSIM instead of plain MSE
        # content_loss = self.content_loss(output_image, target_content)
        content_loss = 1 - ssim(output_image, target_content, data_range=1.0)

        # Style loss (unused; kept for reference)
        # style_loss = 0
        # for out_feature, style_feature in zip(output_features, style_features):
        #     out_gram = self.gram_matrix(out_feature)
        #     style_gram = self.gram_matrix(style_feature)
        #     style_loss += self.style_loss(out_gram, style_gram)

        # Face identity loss
        output_embedding = self.extract_face_latent(output_image)
        target_embedding = self.extract_face_latent(target_content)

        identity_loss = None
        euclidean_distance = None

        if output_embedding is not None and target_embedding is not None:
            similarity = self.cosine_similarity(output_embedding, target_embedding)
            # similarity2 = self.cosine_similarity(output_embedding, torch.tensor(target_face_latent).to(self.device))
            # similarity2 = (similarity2 + 1) / 2
            # remap cosine similarity from [-1, 1] to [0, 1], invert, square, and scale
            identity_loss = 1 - ((similarity + 1) / 2)
            identity_loss = identity_loss ** 2 * 10
            # euclidean_distance = torch.sqrt(torch.sum((output_embedding - target_embedding) ** 2))
            # similarityA = self.cosine_similarity(output_embedding, output_embedding)
            # similarityB = self.cosine_similarity(target_embedding, target_embedding)

            # identity_loss += similarity2
            # margin = 0.2
            # identity_loss = nn.functional.relu(margin - similarity)

            # target = torch.tensor([1.0]).to("cuda")
            # # Binary Cross-Entropy Loss
            # loss = -target * torch.log(similarity) - (1 - target) * torch.log(1 - similarity)

            # identity_loss = loss.mean()
            # identity_loss = 1 - self.identity_loss(output_embedding.unsqueeze(0), target_embedding.unsqueeze(0)).mean()
            # identity_loss = self.get_style_loss(output_embedding, target_embedding)
            # identity_loss = self.content_loss(output_embedding, target_embedding)
            # identity_loss = 1 - torch.nn.functional.cosine_similarity(output_embedding, target_embedding, dim=0)
            # identity_loss = torch.tensor(0.0).to(self.device)

        # Total loss (you can adjust the weights as needed)
        # total_loss = content_loss*0.1 + identity_loss

        return content_loss, identity_loss, euclidean_distance

# Usage example:
# loss_fn = StyleTransferLoss(inswapper=inswapper)
# content_loss, identity_loss, euclidean_distance = loss_fn(output_image, target_content, target_face_latent, source_face_latent)