import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for entry in files:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/checkpoint/checkpoint (7) 2.pth
/kaggle/input/checkpoint-2/checkpoint (10).pth

import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms import v2
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(torch.__version__)
#print(torch.version.cuda)
print(device)

# Pipeline for the "flipped" copy of the training set: the flip probability is
# 1, so every image passing through it is mirrored horizontally before being
# converted to a tensor and normalised per channel with mean 0.5 / std 0.5.
# Augmentations that were tried and are currently disabled:
#   transforms.v2.AugMix()
#   transforms.RandomVerticalFlip()
#   transforms.RandomRotation(15)  # random rotation between -15 and 15 degrees
#   transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1)
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=1),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Plain pipeline (no augmentation): tensor conversion and the same normalisation.
transform_1 = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

#mixup = torchvision.transforms.v2.Mixup(num_classes=10)

# Options shared by the main train and test loaders.
loader_kwargs = dict(batch_size=64, num_workers=4)

# The CIFAR-10 training split is loaded twice -- once through `transform` and
# once through `transform_1` -- and the two copies are concatenated into a
# single training set of twice the size.
trainset_1 = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainset_2 = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_1)
trainset = torch.utils.data.ConcatDataset([trainset_1, trainset_2])
trainloader = torch.utils.data.DataLoader(
    trainset, shuffle=True, **loader_kwargs)

# The test split only ever goes through the plain pipeline.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_1)
testloader = torch.utils.data.DataLoader(
    testset, shuffle=False, **loader_kwargs)

# A second shuffling loader over the same training data and batch size
# (default worker settings).
reshuffled_train_loader = torch.utils.data.DataLoader(
    trainloader.dataset,
    batch_size=trainloader.batch_size,
    shuffle=True,
)

# Human-readable class names, one per label index.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  warnings.warn(_BETA_TRANSFORMS_WARNING)
/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  warnings.warn(_BETA_TRANSFORMS_WARNING)

2.0.0
cuda:0
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz

100%|██████████| 170498071/170498071 [00:03<00:00, 44073594.86it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Files already downloaded and verified

import torch.nn as nn
import torch.nn.functional as F

def res(input_channel):
    """Build the residual branch of a basic block.

    The branch is conv3x3 -> BatchNorm -> ReLU -> conv3x3 -> BatchNorm with the
    channel count unchanged; both convolutions use padding 1 and no bias.
    The skip connection and the final activation are applied by the caller.

    Args:
        input_channel: number of input (and output) channels.

    Returns:
        An ``nn.Sequential`` holding the five layers in order.
    """
    width = input_channel
    layers = [
        nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False),
        nn.BatchNorm2d(width),
        nn.ReLU(inplace=True),
        nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False),
        nn.BatchNorm2d(width),
    ]
    return nn.Sequential(*layers)

def conv_block(input_channel, output_channel, filter_size = 3,padding = 1):
    """Build the main branch of a downsampling block.

    The first convolution changes the channel count and uses stride 2; the
    second keeps the channel count with stride 1. Each convolution is bias-free
    and followed by BatchNorm, with a ReLU between the two halves.

    Args:
        input_channel: channels of the incoming feature map.
        output_channel: channels produced by the block.
        filter_size: kernel size used by both convolutions.
        padding: padding used by both convolutions.

    Returns:
        An ``nn.Sequential`` holding the five layers in order.
    """
    reduce_conv = nn.Conv2d(
        input_channel, output_channel, filter_size,
        stride=2, padding=padding, bias=False,
    )
    reduce_bn = nn.BatchNorm2d(output_channel)
    activation = nn.ReLU(inplace=True)
    refine_conv = nn.Conv2d(
        output_channel, output_channel, filter_size,
        stride=1, padding=padding, bias=False,
    )
    refine_bn = nn.BatchNorm2d(output_channel)
    return nn.Sequential(reduce_conv, reduce_bn, activation, refine_conv, refine_bn)

def start(input_channel, output_channel, filter_size = 3, stride = 1, padding = 1):
    """Build the network stem: one bias-free convolution, BatchNorm, ReLU.

    Args:
        input_channel: channels of the input.
        output_channel: channels produced by the convolution.
        filter_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.

    Returns:
        An ``nn.Sequential`` holding the three layers in order.
    """
    stem_conv = nn.Conv2d(
        input_channel,
        output_channel,
        filter_size,
        stride=stride,
        padding=padding,
        bias=False,
    )
    return nn.Sequential(
        stem_conv,
        nn.BatchNorm2d(output_channel),
        nn.ReLU(inplace=True),
    )

def downsample(input_channel, output_channel):
    """Build the shortcut projection for a downsampling block.

    A bias-free 1x1 convolution with stride 2 followed by BatchNorm, so the
    shortcut changes the channel count and halves the spatial size.

    Args:
        input_channel: channels of the incoming feature map.
        output_channel: channels after projection.

    Returns:
        An ``nn.Sequential`` holding the two layers in order.
    """
    projection = nn.Conv2d(input_channel, output_channel, kernel_size=1, stride=2, bias=False)
    norm = nn.BatchNorm2d(output_channel)
    return nn.Sequential(projection, norm)


class resnet18(nn.Module):
    """Residual image classifier for 3-channel inputs with 10 output logits.

    Layout: a 64-channel stem, two identity residual blocks at 64 channels,
    then three downsampling stages (128, 256, 512 channels). Each stage adds a
    strided main branch to a 1x1 projected shortcut and follows it with one
    identity residual block. Global average pooling and a linear layer produce
    the logits.

    ``pool_1`` and ``drop`` are constructed but are not applied in ``forward``.
    """

    def __init__(self):
        super().__init__()

        # Stem and first (64-channel) stage.
        self.conv1 = start(3, 64)
        self.pool_1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.res1_1 = res(64)
        self.res1_2 = res(64)

        # 64 -> 128 stage.
        self.conv2 = conv_block(64, 128, 3, padding=1)
        self.sample_1 = downsample(64, 128)
        self.res2_1 = res(128)

        # 128 -> 256 stage.
        self.conv3 = conv_block(128, 256, 3, padding=1)
        self.res3_1 = res(256)
        self.sample_2 = downsample(128, 256)

        # 256 -> 512 stage.
        self.conv4 = conv_block(256, 512, 3, padding=1)
        self.res4_1 = res(512)
        self.sample_3 = downsample(256, 512)
        self.drop = nn.Dropout()

        # Classification head.
        self.averagepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc1 = nn.Linear(512, 10)  # one logit per class

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        out = self.conv1(x)
        # The max-pool after the stem (self.pool_1) is intentionally skipped.

        # Identity residual blocks at 64 channels.
        for identity_block in (self.res1_1, self.res1_2):
            out = F.relu(identity_block(out) + out)

        # Downsampling stages: strided main branch + projected shortcut,
        # then one identity residual block at the new width.
        stages = (
            (self.conv2, self.sample_1, self.res2_1),
            (self.conv3, self.sample_2, self.res3_1),
            (self.conv4, self.sample_3, self.res4_1),
        )
        for main_branch, shortcut, identity_block in stages:
            out = F.relu(main_branch(out) + shortcut(out))
            out = F.relu(identity_block(out) + out)

        # Pool each of the 512 feature maps to a single value, then classify.
        pooled = self.averagepool(out)
        features = pooled.reshape(-1, 512)
        return self.fc1(features)

# Build the custom network and move it to the selected device (GPU or CPU).
net = resnet18().to(device)

# Alternative: torchvision's reference ResNet-18 adapted to small inputs.
# net = torchvision.models.resnet18(num_classes=10)
# net.conv1 = nn.Conv2d(1, model.conv1.weight.shape[0], 3, 1, 1, bias = False)
# net.maxpool = nn.MaxPool2d(kernel_size = 1, stride = 1, padding = 0)

# Alternative: resume from a saved checkpoint.
# PATH = "model_path"
# net.load_state_dict(torch.load(PATH)['state_dict'])

import torch.optim as optim

# Loss function, placed on the same device as the model. The previous extra
# `criterion.cuda()` call was redundant after `.to(device)` and raised on
# machines without CUDA, defeating the CPU fallback in `device`.
criterion = nn.CrossEntropyLoss().to(device)

# Optimisation hyper-parameters.
max_lr = 0.1
epochs = 100
optimizer = optim.SGD(net.parameters(), lr=max_lr, weight_decay=0.0001, momentum=0.9)

# Threshold for element-wise gradient clipping (passed to clip_grad_value_ in
# the training loop).
grad_clip = 0.001

# The scheduler is stepped once per batch in the training loop, so the cosine
# period is expressed in optimiser steps: epochs * batches per epoch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs * len(trainloader))
# sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs,
#                                                 steps_per_epoch=len(trainloader))

import numpy as np 
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Return mixed inputs, pairs of targets, and lambda.

    Each sample is blended with another sample of the same batch chosen by a
    random permutation: ``mixed_x = lam * x + (1 - lam) * x[perm]``.

    Args:
        x: batch of inputs; the first dimension is the batch dimension and the
            tensor must have at least two dimensions.
        y: targets aligned with ``x`` along the first dimension.
        alpha: parameter of the Beta(alpha, alpha) distribution from which the
            mixing weight is drawn. If ``alpha <= 0`` no mixing takes place
            (``lam`` is 1).
        use_cuda: kept for backward compatibility and no longer consulted. The
            permutation index is always moved to ``x.device``, so the function
            works on CPU-only machines as well as on GPU tensors.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``y_a`` is ``y``, ``y_b`` is ``y``
        reordered by the same permutation used for the inputs, and ``lam`` is
        the mixing weight.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    # Draw the permutation from the default (CPU) generator, exactly as before,
    # then place it next to the data instead of calling .cuda() unconditionally.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both target sets with weight ``lam``.

    Args:
        criterion: callable ``criterion(pred, target)`` returning a loss.
        pred: predictions for the mixed inputs.
        y_a: targets of the original samples.
        y_b: targets of the samples they were mixed with.
        lam: weight of the ``y_a`` loss; ``1 - lam`` weights the ``y_b`` loss.

    Returns:
        ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

# Main training loop.
# Per-epoch training accuracy and loss are collected in these lists and
# averaged (then cleared) each time a progress line is printed.
train_accu, train_losses = [], []

for epoch in range(epochs):  # loop over the dataset multiple times

    # Running totals for the current epoch.
    running_loss = 0.0
    correct=0
    total=0
    
    #for inputs, labels in combined_data:
    
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device) # move the batch to the model's device
        # NOTE(review): `2/epochs` is a true division, so this compares the
        # epoch index against a small fraction (0.02 if epochs is 100). The
        # condition therefore holds only for epoch 0 -- confirm whether mixup
        # was meant to cover more epochs.
        if epoch <= 2/epochs:
            inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, 0.2)
       
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        
        # Same condition as above: mixed batches use the blended loss,
        # all other batches use the plain criterion.
        if epoch <= 2/epochs:
            loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
        else:
            loss = criterion(outputs, labels)
        
        loss.backward()
        # Clip each gradient element using the grad_clip threshold.
        nn.utils.clip_grad_value_(net.parameters(), grad_clip)
        
        optimizer.step()
        # The learning-rate schedule advances once per batch, not once per epoch.
        scheduler.step()
        # print statistics
        running_loss += loss.item()

        # Training accuracy is always measured against the original labels,
        # including on batches whose inputs were mixed.
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    # Mean batch loss and accuracy (in percent) for this epoch.
    train_loss =running_loss/len(trainloader)
    accu=100.*correct/total

    train_accu.append(accu)
    train_losses.append(train_loss)
    # Every 10th epoch: report the averages over the epochs accumulated since
    # the previous report, together with the current learning rate, then reset.
    if epoch % 10 == 0:
        my_lr = scheduler.get_last_lr()[0]
        avg_accu = sum(train_accu) / len(train_accu)
        avg_loss = sum(train_losses) / len(train_losses)
        print('Train Loss: %.3f | Accuracy: %.3f | lr: %f'%(avg_loss,avg_accu, my_lr))
        train_accu, train_losses = [], []
        
    # Every 20th epoch: measure accuracy on the test loader in eval mode.
    # `correct` and `total` are reused here; they are reset at the top of the
    # next epoch, so the training statistics are not affected.
    if epoch % 20 == 0:
        net.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                images, labels = data
                images, labels = images.to(device), labels.to(device)
                
                outputs = net(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        accuracy = 100 * correct / total
        print('Accuracy of the network on the 10000 test images: {:.6f}%'.format(accuracy))
        # Switch back to training mode before the next epoch.
        net.train()
        
        
    
# Report the learning rate the scheduler ended on.
my_lr = scheduler.get_last_lr()[0]
print('Finished Training', "last_learning_rate", my_lr)

Train Loss: 1.299 | Accuracy: 36.902 | lr: 0.099975
Accuracy of the network on the 10000 test images: 71.510000%
Train Loss: 0.255 | Accuracy: 91.083 | lr: 0.097044
Train Loss: 0.103 | Accuracy: 96.421 | lr: 0.089508
Accuracy of the network on the 10000 test images: 86.760000%
Train Loss: 0.086 | Accuracy: 96.983 | lr: 0.078104
Train Loss: 0.069 | Accuracy: 97.602 | lr: 0.063950
Accuracy of the network on the 10000 test images: 87.830000%
Train Loss: 0.050 | Accuracy: 98.293 | lr: 0.048429
Train Loss: 0.031 | Accuracy: 98.978 | lr: 0.033063
Accuracy of the network on the 10000 test images: 89.300000%
Train Loss: 0.009 | Accuracy: 99.710 | lr: 0.019355
Train Loss: 0.000 | Accuracy: 99.998 | lr: 0.008646
Accuracy of the network on the 10000 test images: 91.660000%
Train Loss: 0.000 | Accuracy: 100.000 | lr: 0.001985
Finished Training last_learning_rate 0.0

# dataiter = iter(testloader)
# images, labels = next(dataiter)

# images, labels = images.to(device), labels.to(device)

# outputs = net(images)
# Final accuracy on the test set.
# The training loop switches the network back to train mode after each of its
# periodic evaluations, so eval mode must be selected explicitly here.
# Without it, BatchNorm would normalise with per-batch statistics and update
# its running statistics from test data (even under no_grad) right before the
# weights are saved.
net.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)

        outputs = net(images)
        # Index of the largest logit is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print('Accuracy of the network on the 10000 test images: {:.6f}%'.format(accuracy))

Accuracy of the network on the 10000 test images: 91.180000%

# Bundle a fresh (untrained) model instance, the trained weights and the
# optimiser state into one dictionary and write it to disk.
checkpoint = dict(
    model=resnet18(),
    state_dict=net.state_dict(),
    optimizer=optimizer.state_dict(),
)

torch.save(checkpoint, 'checkpoint.pth')