diff --git a/examples/__init__.py b/examples/__init__.py
index e69de29..6d707f2 100644
--- a/examples/__init__.py
+++ b/examples/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/examples/fairseq/__init__.py b/examples/fairseq/__init__.py
index e69de29..6d707f2 100644
--- a/examples/fairseq/__init__.py
+++ b/examples/fairseq/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/examples/fairseq/generate.py b/examples/fairseq/generate.py
index 020d974..7c46266 100644
--- a/examples/fairseq/generate.py
+++ b/examples/fairseq/generate.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import models
 import tasks
 
diff --git a/examples/fairseq/interactive.py b/examples/fairseq/interactive.py
index 5d824ab..bcf6b64 100644
--- a/examples/fairseq/interactive.py
+++ b/examples/fairseq/interactive.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import models
 import tasks
 
diff --git a/examples/fairseq/models/__init__.py b/examples/fairseq/models/__init__.py
index 05bac79..cbaa46b 100644
--- a/examples/fairseq/models/__init__.py
+++ b/examples/fairseq/models/__init__.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import argparse
 import importlib
 import os
diff --git a/examples/fairseq/models/bert.py b/examples/fairseq/models/bert.py
index 75d28f4..d7c3202 100644
--- a/examples/fairseq/models/bert.py
+++ b/examples/fairseq/models/bert.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import math
 import logging
 from typing import Any, Dict, List, Optional
diff --git a/examples/fairseq/models/language_modeling.py b/examples/fairseq/models/language_modeling.py
index 5b762a8..d754a8f 100644
--- a/examples/fairseq/models/language_modeling.py
+++ b/examples/fairseq/models/language_modeling.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/examples/fairseq/models/machine_translation.py b/examples/fairseq/models/machine_translation.py
index 009c1a0..05e3633 100644
--- a/examples/fairseq/models/machine_translation.py
+++ b/examples/fairseq/models/machine_translation.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/examples/fairseq/tasks/__init__.py b/examples/fairseq/tasks/__init__.py
index 61e32ef..86d3d37 100644
--- a/examples/fairseq/tasks/__init__.py
+++ b/examples/fairseq/tasks/__init__.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import argparse
 import importlib
 import os
diff --git a/examples/fairseq/tasks/data/__init__.py b/examples/fairseq/tasks/data/__init__.py
index e69de29..6d707f2 100644
--- a/examples/fairseq/tasks/data/__init__.py
+++ b/examples/fairseq/tasks/data/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/examples/fairseq/tasks/data/basic_loader.py b/examples/fairseq/tasks/data/basic_loader.py
index 089dd07..ca9a3b5 100644
--- a/examples/fairseq/tasks/data/basic_loader.py
+++ b/examples/fairseq/tasks/data/basic_loader.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import math
 import re
 import sys
diff --git a/examples/fairseq/tasks/data/mlm_loader.py b/examples/fairseq/tasks/data/mlm_loader.py
index 831355c..4d8c712 100644
--- a/examples/fairseq/tasks/data/mlm_loader.py
+++ b/examples/fairseq/tasks/data/mlm_loader.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import glob
 import os
 import torch
diff --git a/examples/fairseq/tasks/data/utils.py b/examples/fairseq/tasks/data/utils.py
index 128bc6a..eb2310e 100644
--- a/examples/fairseq/tasks/data/utils.py
+++ b/examples/fairseq/tasks/data/utils.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import os
 import gzip
 import numpy as np
diff --git a/examples/fairseq/tasks/pretraining.py b/examples/fairseq/tasks/pretraining.py
index 3f4aeef..d935b91 100644
--- a/examples/fairseq/tasks/pretraining.py
+++ b/examples/fairseq/tasks/pretraining.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/examples/fairseq/train.py b/examples/fairseq/train.py
index 6587ee2..2c2b120 100644
--- a/examples/fairseq/train.py
+++ b/examples/fairseq/train.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import models
 import tasks
 
diff --git a/examples/fairseq/utils/__init__.py b/examples/fairseq/utils/__init__.py
index e69de29..6d707f2 100644
--- a/examples/fairseq/utils/__init__.py
+++ b/examples/fairseq/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/examples/fairseq/utils/sparse_clip.py b/examples/fairseq/utils/sparse_clip.py
index 633b71e..6f244dc 100644
--- a/examples/fairseq/utils/sparse_clip.py
+++ b/examples/fairseq/utils/sparse_clip.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import torch
 import warnings
 from fairseq.utils import multi_tensor_l2norm_available, multi_tensor_total_norm
diff --git a/setup.py b/setup.py
index 9030581..5cf8853 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 from io import open
 from setuptools import find_packages, setup
 
diff --git a/tests/__init__.py b/tests/__init__.py
index e69de29..6d707f2 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/tests/test_decoder.py b/tests/test_decoder.py
index 257f2c0..d95080c 100644
--- a/tests/test_decoder.py
+++ b/tests/test_decoder.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import pytest
 from torchscale.architecture.config import DecoderConfig
 from torchscale.architecture.decoder import Decoder
diff --git a/tests/test_encoder.py b/tests/test_encoder.py
index 922b881..d179956 100644
--- a/tests/test_encoder.py
+++ b/tests/test_encoder.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import pytest
 from torchscale.architecture.config import EncoderConfig
 from torchscale.architecture.encoder import Encoder
diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py
index 9855fa0..19672c0 100644
--- a/tests/test_encoder_decoder.py
+++ b/tests/test_encoder_decoder.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import pytest
 from torchscale.architecture.config import EncoderDecoderConfig
 from torchscale.architecture.encoder_decoder import EncoderDecoder
diff --git a/torchscale/__init__.py b/torchscale/__init__.py
index e69de29..6d707f2 100644
--- a/torchscale/__init__.py
+++ b/torchscale/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/torchscale/architecture/__init__.py b/torchscale/architecture/__init__.py
index e69de29..6d707f2 100644
--- a/torchscale/architecture/__init__.py
+++ b/torchscale/architecture/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/torchscale/architecture/config.py b/torchscale/architecture/config.py
index ad1621a..b424582 100644
--- a/torchscale/architecture/config.py
+++ b/torchscale/architecture/config.py
@@ -1,4 +1,5 @@
-
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
 
 class EncoderConfig(object):
     def __init__(self, **kwargs):
diff --git a/torchscale/architecture/decoder.py b/torchscale/architecture/decoder.py
index 0f816f1..7b35b80 100644
--- a/torchscale/architecture/decoder.py
+++ b/torchscale/architecture/decoder.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import math
 import torch
 import torch.nn as nn
diff --git a/torchscale/architecture/encoder.py b/torchscale/architecture/encoder.py
index 18aadb0..32b71cc 100644
--- a/torchscale/architecture/encoder.py
+++ b/torchscale/architecture/encoder.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
 
 import math
 import torch
diff --git a/torchscale/architecture/encoder_decoder.py b/torchscale/architecture/encoder_decoder.py
index fbe9219..6c57015 100644
--- a/torchscale/architecture/encoder_decoder.py
+++ b/torchscale/architecture/encoder_decoder.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import torch.nn as nn
 from torchscale.architecture.encoder import Encoder
 from torchscale.architecture.decoder import Decoder
diff --git a/torchscale/architecture/utils.py b/torchscale/architecture/utils.py
index cf21997..d866458 100644
--- a/torchscale/architecture/utils.py
+++ b/torchscale/architecture/utils.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import torch.nn as nn
 from torchscale.component.multihead_attention import MultiheadAttention
 from torchscale.component.multiway_network import MultiwayNetwork
diff --git a/torchscale/component/__init__.py b/torchscale/component/__init__.py
index e69de29..6d707f2 100644
--- a/torchscale/component/__init__.py
+++ b/torchscale/component/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/torchscale/component/droppath.py b/torchscale/component/droppath.py
index cfe0a3c..3c40d35 100644
--- a/torchscale/component/droppath.py
+++ b/torchscale/component/droppath.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 from timm.models.layers import drop_path
 import torch.nn as nn
 
diff --git a/torchscale/component/embedding.py b/torchscale/component/embedding.py
index b4c285c..678a39b 100644
--- a/torchscale/component/embedding.py
+++ b/torchscale/component/embedding.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/torchscale/component/feedforward_network.py b/torchscale/component/feedforward_network.py
index c4045fd..f2c3c49 100644
--- a/torchscale/component/feedforward_network.py
+++ b/torchscale/component/feedforward_network.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/torchscale/component/multihead_attention.py b/torchscale/component/multihead_attention.py
index e4e6ec7..9b8dd3e 100644
--- a/torchscale/component/multihead_attention.py
+++ b/torchscale/component/multihead_attention.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import math
 import torch
 from torch import nn
diff --git a/torchscale/component/multiway_network.py b/torchscale/component/multiway_network.py
index 340d045..bd4224d 100644
--- a/torchscale/component/multiway_network.py
+++ b/torchscale/component/multiway_network.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import copy
 import torch
 import torch.nn as nn
diff --git a/torchscale/component/relative_position_bias.py b/torchscale/component/relative_position_bias.py
index abb00e5..73d382c 100644
--- a/torchscale/component/relative_position_bias.py
+++ b/torchscale/component/relative_position_bias.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import math
 import torch
 import torch.nn as nn
diff --git a/torchscale/component/xmoe/__init__.py b/torchscale/component/xmoe/__init__.py
index e69de29..6d707f2 100644
--- a/torchscale/component/xmoe/__init__.py
+++ b/torchscale/component/xmoe/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file
diff --git a/torchscale/component/xmoe/moe_layer.py b/torchscale/component/xmoe/moe_layer.py
index 6944bfd..a621b7a 100644
--- a/torchscale/component/xmoe/moe_layer.py
+++ b/torchscale/component/xmoe/moe_layer.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 #
 # This source code is licensed under the BSD license found in the
diff --git a/torchscale/component/xmoe/routing.py b/torchscale/component/xmoe/routing.py
index ce6bf74..c882e83 100644
--- a/torchscale/component/xmoe/routing.py
+++ b/torchscale/component/xmoe/routing.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 #
 # This source code is licensed under the BSD license found in the
diff --git a/torchscale/model/BEiT3.py b/torchscale/model/BEiT3.py
index e5cb7b9..84bfe52 100644
--- a/torchscale/model/BEiT3.py
+++ b/torchscale/model/BEiT3.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
 import torch
 import torch.nn as nn
 from torchscale.architecture.encoder import Encoder
diff --git a/torchscale/model/__init__.py b/torchscale/model/__init__.py
index e69de29..6d707f2 100644
--- a/torchscale/model/__init__.py
+++ b/torchscale/model/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
\ No newline at end of file