Delete 'Baseline.py'

main
p5j3hxipf 9 months ago
parent 1dbfbf60db
commit 4e54057354

@ -1,989 +0,0 @@
# %%
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns
sns.set()
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR, CyclicLR
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from glob import glob
from skimage.io import imread
from os import listdir
import time
import copy
from tqdm import tqdm_notebook as tqdm
# %%
# Toggles that decide which expensive steps actually run in this notebook.
run_training = True        # train the model (otherwise load saved weights)
retrain = False            # continue training from a checkpoint
find_learning_rate = True  # run the LR-range search (otherwise load its CSV)
# %%
# Peek at the dataset root: one folder per patient plus a nested copy.
files = listdir("input/breast-histopathology-images/")
print(len(files))
# %%
files[:10]
# %%
files = listdir("input/breast-histopathology-images/IDC_regular_ps50_idx5/")
len(files)
# %%
base_path = "input/breast-histopathology-images/IDC_regular_ps50_idx5/"
folder = listdir(base_path)
num_patients = 89  # number of patients to keep for this experiment
folder = folder[:num_patients]
len(folder)
# %%
# Count every patch image across the selected patients (both class folders).
total_images = 0
for patient_id in folder:
    for label in (0, 1):
        class_dir = base_path + patient_id + "/" + str(label) + "/"
        total_images += len(listdir(class_dir))
# %%
total_images
# %%
# Build one row per patch: patient id, file path and class label.
# BUG FIX: the original used chained indexing (data.iloc[k]["path"] = ...),
# which assigns into a temporary object and is not guaranteed to update the
# frame; a single positional .iloc[row, col] assignment is reliable.
data = pd.DataFrame(index=np.arange(0, total_images), columns=["patient_id", "path", "target"])
patient_col = data.columns.get_loc("patient_id")
path_col = data.columns.get_loc("path")
target_col = data.columns.get_loc("target")
k = 0
for n in range(len(folder)):
    patient_id = folder[n]
    patient_path = base_path + patient_id
    for c in [0, 1]:
        class_path = patient_path + "/" + str(c) + "/"
        for image_path in listdir(class_path):
            data.iloc[k, path_col] = class_path + image_path
            data.iloc[k, target_col] = c
            data.iloc[k, patient_col] = patient_id
            k += 1
data.head()
# %%
data.shape
# %%
# Per-patient patch counts, share of IDC patches, and overall class balance.
target_by_patient = data.groupby("patient_id").target
cancer_perc = (target_by_patient.value_counts() / target_by_patient.size()).unstack()
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
sns.distplot(data.groupby("patient_id").size(), ax=ax[0], color="Orange", kde=False, bins=30)
ax[0].set_xlabel("Number of patches")
ax[0].set_ylabel("Frequency")
ax[0].set_title("How many patches do we have per patient?")
sns.distplot(cancer_perc.loc[:, 1] * 100, ax=ax[1], color="Tomato", kde=False, bins=30)
ax[1].set_title("How much percentage of an image is covered by IDC?")
ax[1].set_ylabel("Frequency")
ax[1].set_xlabel("% of patches with IDC")
sns.countplot(data.target, palette="Set2", ax=ax[2])
ax[2].set_xlabel("no(0) versus yes(1)")
ax[2].set_title("How many patches show IDC?")
# %%
# Targets were stored per-folder as ints, but the frame is object dtype; fix that.
data.target = data.target.astype(int)
# %%
# Draw 50 random positive and 50 random negative patches for visual inspection.
pos_selection = np.random.choice(data[data.target == 1].index.values, size=50, replace=False)
neg_selection = np.random.choice(data[data.target == 0].index.values, size=50, replace=False)
# %%
def _plot_patch_grid(selection):
    # Render the sampled patches as a 5x10 image grid.
    fig, ax = plt.subplots(5, 10, figsize=(20, 10))
    for row in range(5):
        for col in range(10):
            idx = selection[col + 10 * row]
            ax[row, col].imshow(imread(data.loc[idx, "path"]))
            ax[row, col].grid(False)

_plot_patch_grid(pos_selection)
# %%
_plot_patch_grid(neg_selection)
# %%
def extract_coords(df):
    """Parse the patch x/y coordinates out of each file path into `df`.

    Paths look like ".../<pid>_idx5_x1001_y851_class0.png"; rsplit on the
    last four underscores yields [prefix, idx5, x1001, y851, class0.png].
    Adds integer columns "x" and "y" to df and returns it.
    """
    coord = df.path.str.rsplit("_", n=4, expand=True)
    coord = coord.drop([0, 1, 4], axis=1)
    coord = coord.rename({2: "x", 3: "y"}, axis=1)
    # BUG FIX: regex=True is required — pandas >= 2.0 rejects `case` with the
    # new literal-replace default (pandas 1.x only warned and used regex).
    coord.loc[:, "x"] = coord.loc[:, "x"].str.replace("x", "", case=False, regex=True).astype(int)
    coord.loc[:, "y"] = coord.loc[:, "y"].str.replace("y", "", case=False, regex=True).astype(int)
    df.loc[:, "x"] = coord.x.values
    df.loc[:, "y"] = coord.y.values
    return df
def get_cancer_dataframe(patient_id, cancer_id):
    """Build a dataframe of all patches in one patient/class folder.

    `cancer_id` is the folder name "0" or "1". Returns columns x, y
    (integer patch coordinates), target (int label) and path.
    Relies on the module-level `base_path`.
    """
    path = base_path + patient_id + "/" + cancer_id
    files = listdir(path)
    dataframe = pd.DataFrame(files, columns=["filename"])
    path_names = path + "/" + dataframe.filename.values
    dataframe = dataframe.filename.str.rsplit("_", n=4, expand=True)
    dataframe.loc[:, "target"] = int(cancer_id)
    dataframe.loc[:, "path"] = path_names
    dataframe = dataframe.drop([0, 1, 4], axis=1)
    dataframe = dataframe.rename({2: "x", 3: "y"}, axis=1)
    # BUG FIX: regex=True keeps `case=False` legal on pandas >= 2.0
    # (pandas 1.x silently treated the pattern as a regex anyway).
    dataframe.loc[:, "x"] = dataframe.loc[:, "x"].str.replace("x", "", case=False, regex=True).astype(int)
    dataframe.loc[:, "y"] = dataframe.loc[:, "y"].str.replace("y", "", case=False, regex=True).astype(int)
    return dataframe
def get_patient_dataframe(patient_id):
    """Concatenate the benign ("0") and cancer ("1") patch tables of a patient."""
    frames = [get_cancer_dataframe(patient_id, label) for label in ("0", "1")]
    return pd.concat(frames, ignore_index=True)
# %%
example = get_patient_dataframe(data.patient_id.values[0])
example.head()
# %%
# Spatial layout of the patches of 15 patients, colored by label.
fig, ax = plt.subplots(5, 3, figsize=(20, 27))
patient_ids = data.patient_id.unique()
for n in range(5):
    for m in range(3):
        patient_id = patient_ids[m + 3 * n]
        example_df = get_patient_dataframe(patient_id)
        ax[n, m].scatter(example_df.x.values, example_df.y.values, c=example_df.target.values, cmap="coolwarm", s=20)
        ax[n, m].set_title("patient " + patient_id)
        # BUG FIX: x values are on the x-axis, so label it "x coord"
        # (the original had the two axis labels swapped).
        ax[n, m].set_xlabel("x coord")
        ax[n, m].set_ylabel("y coord")
# %%
def visualise_breast_tissue(patient_id, pred_df=None):
    """Stitch a patient's 50x50 patches back into one whole-slide image.

    Returns a 4-tuple:
      grid           -- reconstructed RGB tissue image (white background)
      mask           -- white image with ground-truth cancer patches painted red
      broken_patches -- paths of patches that could not be placed
      mask_proba     -- per-pixel predicted probability (zeros when pred_df is None)
    """
    example_df = get_patient_dataframe(patient_id)
    max_point = [example_df.y.max() - 1, example_df.x.max() - 1]
    grid = 255 * np.ones(shape=(max_point[0] + 50, max_point[1] + 50, 3)).astype(np.uint8)
    mask = 255 * np.ones(shape=(max_point[0] + 50, max_point[1] + 50, 3)).astype(np.uint8)
    # BUG FIX: mask_proba is part of the return value in every case, so it
    # must exist even when pred_df is None (otherwise the bare call
    # visualise_breast_tissue(example) raises a NameError/UnboundLocalError).
    mask_proba = np.zeros(shape=(max_point[0] + 50, max_point[1] + 50, 1)).astype(float)
    if pred_df is not None:
        patient_df = pred_df[pred_df.patient_id == patient_id].copy()
    broken_patches = []
    for n in range(len(example_df)):
        try:
            image = imread(example_df.path.values[n])
            target = example_df.target.values[n]
            x_coord = int(example_df.x.values[n])
            y_coord = int(example_df.y.values[n])
            x_start = x_coord - 1
            y_start = y_coord - 1
            x_end = x_start + 50
            y_end = y_start + 50
            grid[y_start:y_end, x_start:x_end] = image
            if target == 1:
                # Paint ground-truth cancer patches red in the mask.
                mask[y_start:y_end, x_start:x_end, 0] = 250
                mask[y_start:y_end, x_start:x_end, 1] = 0
                mask[y_start:y_end, x_start:x_end, 2] = 0
            if pred_df is not None:
                proba = patient_df[
                    (patient_df.x == x_coord) & (patient_df.y == y_coord)].proba
                mask_proba[y_start:y_end, x_start:x_end, 0] = float(proba)
        except ValueError:
            # Patch with an unexpected size/shape — skip it but remember the path.
            broken_patches.append(example_df.path.values[n])
    return grid, mask, broken_patches, mask_proba
# %%
# Reconstruct and show one example patient's slide next to its cancer mask.
example = "8863"
grid, mask, broken_patches, _ = visualise_breast_tissue(example)
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
ax[0].imshow(grid, alpha=0.9)
ax[1].imshow(mask, alpha=0.8)
ax[1].imshow(grid, alpha=0.7)
ax[0].grid(False)
ax[1].grid(False)
for m in range(2):
    # BUG FIX: the horizontal image axis is the x coordinate
    # (both labels said "y-coord" in the original).
    ax[m].set_xlabel("x-coord")
    ax[m].set_ylabel("y-coord")
# BUG FIX: the titles used the stale loop variable `patient_id` from an
# earlier cell; this figure shows patient `example`.
ax[0].set_title("Breast tissue slice of patient: " + example)
ax[1].set_title("Cancer tissue colored red \n of patient: " + example);
# %%
broken_patches
# %%
# Training configuration and artifact locations.
BATCH_SIZE = 8
NUM_CLASSES = 2
OUTPUT_PATH = ""
MODEL_PATH = "input/breastcancermodel/"
LOSSES_PATH = "input/breastcancermodel/"
# %%
# Fix the RNG seeds so splits and sampling are reproducible.
torch.manual_seed(0)
np.random.seed(0)
# %%
data.head()
# Targets become strings here; they are cast back to int per-sample in the dataset.
data.loc[:, "target"] = data.target.astype(str)
data.info()
# %%
# Patient-level split so no patient appears in more than one set:
# 70% train, then the remaining 30% is halved into dev and test.
patients = data.patient_id.unique()
train_ids, sub_test_ids = train_test_split(patients,
                                           test_size=0.3,
                                           random_state=0)
test_ids, dev_ids = train_test_split(sub_test_ids, test_size=0.5, random_state=0)
# %%
# Split sizes as a percentage of all patients.
print(len(train_ids)/patients.shape[0]*100, len(dev_ids)/patients.shape[0]*100, len(test_ids)/patients.shape[0]*100)
# %%
print(len(train_ids), len(dev_ids), len(test_ids))
# %%
# Materialize the per-split dataframes and parse the patch coordinates.
train_df = data.loc[data.patient_id.isin(train_ids),:].copy()
test_df = data.loc[data.patient_id.isin(test_ids),:].copy()
dev_df = data.loc[data.patient_id.isin(dev_ids),:].copy()
train_df = extract_coords(train_df)
test_df = extract_coords(test_df)
dev_df = extract_coords(dev_df)
# %%
# Class balance per split.
fig, ax = plt.subplots(1,3,figsize=(20,5))
sns.countplot(train_df.target, ax=ax[0], palette="Reds")
ax[0].set_title("Train data")
sns.countplot(dev_df.target, ax=ax[1], palette="Blues")
ax[1].set_title("Dev data")
sns.countplot(test_df.target, ax=ax[2], palette="Greens");
ax[2].set_title("Test data");
# %%
def my_transform(key="train", plot=False):
    """Return the torchvision pipeline for `key` ("train" or "val").

    Train adds random horizontal/vertical flips on top of the 50x50 resize.
    With plot=True the tensor conversion and ImageNet normalization are left
    out so the transformed image can be displayed directly.
    """
    sequences = {
        "train": [transforms.Resize((50, 50)),
                  transforms.RandomHorizontalFlip(),
                  transforms.RandomVerticalFlip()],
        "val": [transforms.Resize((50, 50))],
    }
    if not plot:
        for seq in sequences.values():
            seq.extend([transforms.ToTensor(),
                        transforms.Normalize([0.485, 0.456, 0.406],
                                             [0.229, 0.224, 0.225])])
    return transforms.Compose(sequences[key])
# %%
class BreastCancerDataset(Dataset):
    """Torch dataset yielding one dict per patch: image, label and metadata."""

    def __init__(self, df, transform=None):
        self.states = df            # backing dataframe, one row per patch
        self.transform = transform  # optional torchvision pipeline

    def __len__(self):
        return len(self.states)

    def __getitem__(self, idx):
        rows = self.states
        image = Image.open(rows.path.values[idx]).convert('RGB')
        if self.transform:
            image = self.transform(image)
        # Unlabeled dataframes (no "target" column) yield label=None.
        if "target" in rows.columns.values:
            label = int(rows.target.values[idx])
        else:
            label = None
        return {
            "image": image,
            "label": label,
            "patient_id": rows.patient_id.values[idx],
            "x": rows.x.values[idx],
            "y": rows.y.values[idx],
        }
# %%
# Wrap each split in a dataset with its matching transform pipeline.
train_dataset = BreastCancerDataset(train_df, transform=my_transform(key="train"))
dev_dataset = BreastCancerDataset(dev_df, transform=my_transform(key="val"))
test_dataset = BreastCancerDataset(test_df, transform=my_transform(key="val"))
# %%
image_datasets = {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
dataset_sizes = {x: len(image_datasets[x]) for x in ["train", "dev", "test"]}
# %%
# Show original patches next to their (plot-mode) train/val transforms.
fig, ax = plt.subplots(3,6,figsize=(20,11))
train_transform = my_transform(key="train", plot=True)
val_transform = my_transform(key="val", plot=True)
for m in range(6):
    filepath = train_df.path.values[m]
    image = Image.open(filepath)
    ax[0,m].imshow(image)
    transformed_img = train_transform(image)
    ax[1,m].imshow(transformed_img)
    ax[2,m].imshow(val_transform(image))
    ax[0,m].grid(False)
    ax[1,m].grid(False)
    ax[2,m].grid(False)
    ax[0,m].set_title(train_df.patient_id.values[m] + "\n target: " + train_df.target.values[m])
    ax[1,m].set_title("Preprocessing for train")
    ax[2,m].set_title("Preprocessing for val")
# %%
# drop_last on train/dev keeps batch statistics consistent; test keeps every sample.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
# %%
dataloaders = {"train": train_dataloader, "dev": dev_dataloader, "test": test_dataloader}
# %%
print(len(dataloaders["train"]), len(dataloaders["dev"]), len(dataloaders["test"]))
# %%
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
class BasicBlock(nn.Module):
    """Two 3x3 conv+BN layers with an (optionally projected) residual skip."""

    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # The first conv may stride to downsample; the second keeps the size.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        # Project the identity branch when the main branch changes shape.
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + shortcut
        return self.relu(out)
class ResNet18(nn.Module):
    """ResNet-18 assembled from BasicBlock units (two blocks per stage)."""

    def __init__(self, num_classes=2):
        super(ResNet18, self).__init__()
        self.in_channels = 64
        # Stem: 7x7 stride-2 conv followed by a stride-2 max pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(64, 2, stride=1)
        self.layer2 = self._make_layer(128, 2, stride=2)
        self.layer3 = self._make_layer(256, 2, stride=2)
        self.layer4 = self._make_layer(512, 2, stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)
        self._initialize_weights()

    def _make_layer(self, out_channels, blocks, stride=1):
        """Stack `blocks` BasicBlocks; only the first may downsample/project."""
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        layers = [BasicBlock(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels
        layers += [BasicBlock(out_channels, out_channels) for _ in range(1, blocks)]
        return nn.Sequential(*layers)

    def _initialize_weights(self):
        """He init for convs, unit/zero for batch norms, Xavier for the classifier."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm1d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = self.avgpool(x)
        return self.fc(torch.flatten(x, 1))
# Create the model instance.
NUM_CLASSES = 2 # adjust to the number of target classes
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResNet18(num_classes=NUM_CLASSES).to(device)
# %%
# %%
# Class weights to counter the heavy class imbalance.
# BUG FIX: `classes` must be in ascending label order so that weight i lines
# up with class index i in CrossEntropyLoss; Series.unique() returns labels
# in order of first appearance, which is not guaranteed to be sorted.
weights = compute_class_weight(y=train_df.target.values, class_weight="balanced", classes=np.unique(train_df.target.values))
class_weights = torch.FloatTensor(weights)
if device.type=="cuda":
    class_weights = class_weights.cuda()
print(class_weights)
# %%
train_df.target.unique()
# %%
criterion = nn.CrossEntropyLoss(weight=class_weights)
# %%
def f1_score(preds, targets):
    """Epsilon-smoothed F1 score for binary 0/1 prediction/target tensors."""
    eps = 1e-7
    true_pos = (preds * targets).sum().to(torch.float32)
    false_pos = (preds * (1 - targets)).sum().to(torch.float32)
    false_neg = ((1 - preds) * targets).sum().to(torch.float32)
    precision = true_pos / (true_pos + false_pos + eps)
    recall = true_pos / (true_pos + false_neg + eps)
    return 2 * precision * recall / (precision + recall + eps)
# %%
def train_loop(model, criterion, optimizer, lr_find=False, scheduler=None, num_epochs=3, lam=0.0):
    """Train/evaluate `model` over the module-level `dataloaders`.

    With lr_find=True only the train phase runs and a smoothed-loss /
    learning-rate trace is recorded for the LR-range test; otherwise every
    epoch runs train, dev and test phases and the weights with the best dev
    accuracy are kept. Returns a dict with keys "model", "loss_dict"
    (per-epoch), "running_loss_dict" (per-epoch running averages) and
    "lr_find". NOTE(review): relies on the globals `dataloaders`,
    `dataset_sizes`, `device` and `tqdm`.
    """
    since = time.time()
    if lr_find:
        phases = ["train"]
    else:
        phases = ["train", "dev", "test"]
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    loss_dict = {"train": [], "dev": [], "test": []}
    # lam_tensor feeds the (currently disabled) L2 penalty below.
    lam_tensor = torch.tensor(lam, device=device)
    running_loss_dict = {"train": [], "dev": [], "test": []}
    lr_find_loss = []
    lr_find_lr = []
    smoothing = 0.2  # EMA factor for the lr-find loss trace
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in phases:
            if phase == "train":
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_corrects = 0
            tk0 = tqdm(dataloaders[phase], total=int(len(dataloaders[phase])))
            counter = 0
            for bi, d in enumerate(tk0):
                inputs = d["image"]
                labels = d["label"]
                inputs = inputs.to(device, dtype=torch.float)
                labels = labels.to(device, dtype=torch.long)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        # l2_reg = torch.tensor(0., device=device)
                        # for param in model.parameters():
                        #     l2_reg = lam_tensor * torch.norm(param)
                        # loss += l2_reg
                        optimizer.step()
                        # cyclical lr schedule is invoked after each batch
                        if scheduler is not None:
                            scheduler.step()
                        if lr_find:
                            # Record the current lr and an EMA-smoothed loss.
                            lr_step = optimizer.state_dict()["param_groups"][0]["lr"]
                            lr_find_lr.append(lr_step)
                            if counter == 0:
                                lr_find_loss.append(loss.item())
                            else:
                                smoothed_loss = smoothing * loss.item() + (1 - smoothing) * lr_find_loss[-1]
                                lr_find_loss.append(smoothed_loss)
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                counter += 1
                tk0.set_postfix({'loss': running_loss / (counter * dataloaders[phase].batch_size),
                                 'accuracy': running_corrects.double() / (counter * dataloaders[phase].batch_size)})
            running_loss_dict[phase].append(running_loss / (counter * dataloaders[phase].batch_size))
            epoch_loss = running_loss / dataset_sizes[phase]
            loss_dict[phase].append(epoch_loss)
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'dev' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    results = {"model": model,
               "loss_dict": loss_dict,
               "running_loss_dict": running_loss_dict,
               "lr_find": {"lr": lr_find_lr, "loss": lr_find_loss}}
    return results
# %%
# Sweep bounds for the learning-rate range test.
start_lr = 1e-6
end_lr = 0.1
# %%
def get_lr_search_scheduler(optimizer, min_lr, max_lr, max_iterations):
    """Sweep the LR linearly from min_lr to max_lr over max_iterations steps.

    max_iterations should equal num_epochs * iterations_per_epoch so the
    upward ramp spans exactly the LR-search run.
    """
    return torch.optim.lr_scheduler.CyclicLR(
        optimizer=optimizer,
        base_lr=min_lr,
        max_lr=max_lr,
        step_size_up=max_iterations,
        step_size_down=max_iterations,
        mode="triangular",
    )
def get_scheduler(optimiser, min_lr, max_lr, stepsize):
    """Triangular cyclic LR schedule; `stepsize` is one full cycle in steps.

    Suggested stepsize is 2 * iterations_per_epoch: half the cycle ramps
    up from min_lr to max_lr, half ramps back down.
    """
    half_cycle = int(stepsize / 2)
    return torch.optim.lr_scheduler.CyclicLR(
        optimizer=optimiser,
        base_lr=min_lr,
        max_lr=max_lr,
        step_size_up=half_cycle,
        step_size_down=half_cycle,
        mode="triangular",
    )
# %%
import math
if find_learning_rate:
    # Sweep the LR over one epoch (classifier head only) and save the trace.
    lr_find_epochs = 1
    optimizer = optim.SGD(model.fc.parameters(), start_lr)
    scheduler = get_lr_search_scheduler(optimizer, start_lr, end_lr, lr_find_epochs * len(train_dataloader))
    results = train_loop(model, criterion, optimizer, lr_find=True, scheduler=scheduler, num_epochs=lr_find_epochs)
    find_lr_df = pd.DataFrame(results["lr_find"]["loss"], columns=["smoothed loss"])
    find_lr_df.loc[:, "lr"] = results["lr_find"]["lr"]
    find_lr_df.to_csv("learning_rate_search.csv", index=False)
else:
    # Reuse a previously saved search result.
    find_lr_df = pd.read_csv(MODEL_PATH + "learning_rate_search.csv")
# %%
# Left: the lr ramp over steps. Right: the smoothed training loss over steps.
fig, ax = plt.subplots(1,2,figsize=(20,5))
ax[0].plot(find_lr_df.lr.values)
ax[1].plot(find_lr_df["smoothed loss"].values)
ax[0].set_xlabel("Steps")
ax[0].set_ylabel("Learning rate")
ax[1].set_xlabel("Steps")
ax[1].set_ylabel("Loss");
ax[0].set_title("How the learning rate increases during search")
ax[1].set_title("How the training loss evolves during search")
# Loss vs lr on a log axis — the classic LR-range-test plot.
plt.figure(figsize=(20,5))
plt.plot(find_lr_df.lr.values, find_lr_df["smoothed loss"].values, '-', color="tomato");
plt.xlabel("Learning rate")
plt.xscale("log")
plt.ylabel("Smoothed Loss")
plt.title("Searching for the optimal learning rate");
# %%
# Cyclic-LR bounds picked from the range-test plot above.
start_lr = 1e-6
end_lr = 0.006
# %%
if run_training:
    # Train only the classifier head with a cyclic LR schedule, then persist
    # the weights and both loss tables.
    NUM_EPOCHS = 2
    optimizer = optim.SGD(model.fc.parameters(), lr=0.01)
    scheduler = get_scheduler(optimizer, start_lr, end_lr, 2 * NUM_EPOCHS)
    results = train_loop(model, criterion, optimizer, scheduler=scheduler, num_epochs=NUM_EPOCHS)
    model, loss_dict, running_loss_dict = results["model"], results["loss_dict"], results["running_loss_dict"]
    # BUG FIX: `device` is a torch.device object, so compare its `.type`
    # attribute (the file already does this at the class-weight cell);
    # comparing the device object to the string "cpu" mislabels CPU
    # checkpoints on torch versions without str comparison support.
    if device.type == "cpu":
        OUTPUT_PATH += ".pth"
    else:
        OUTPUT_PATH += "_cuda.pth"
    torch.save(model.state_dict(), OUTPUT_PATH)
    losses_df = pd.DataFrame(loss_dict["train"], columns=["train"])
    losses_df.loc[:, "dev"] = loss_dict["dev"]
    losses_df.loc[:, "test"] = loss_dict["test"]
    losses_df.to_csv("losses_breastcancer.csv", index=False)
    # dev/test have fewer entries than train when lr_find runs padded them;
    # align by position.
    running_losses_df = pd.DataFrame(running_loss_dict["train"], columns=["train"])
    running_losses_df.loc[0:len(running_loss_dict["dev"]) - 1, "dev"] = running_loss_dict["dev"]
    running_losses_df.loc[0:len(running_loss_dict["test"]) - 1, "test"] = running_loss_dict["test"]
    running_losses_df.to_csv("running_losses_breastcancer.csv", index=False)
else:
    # Load a saved checkpoint and its loss history instead of retraining.
    if device.type == "cpu":
        load_path = MODEL_PATH + ".pth"
    else:
        load_path = MODEL_PATH + "_cuda.pth"
    model.load_state_dict(torch.load(load_path, map_location='cpu'))
    model.eval()
    losses_df = pd.read_csv(LOSSES_PATH + "losses_breastcancer.csv")
    running_losses_df = pd.read_csv(LOSSES_PATH + "running_losses_breastcancer.csv")
# %%
# Per-epoch loss curves for all three splits.
plt.figure(figsize=(20,5))
plt.plot(losses_df["train"], '-o', label="train")
plt.plot(losses_df["dev"], '-o', label="dev")
# BUG FIX: the test curve was labeled "dev" in the legend.
plt.plot(losses_df["test"], '-o', label="test")
plt.xlabel("Epoch")
plt.ylabel("Weighted x-entropy")
plt.title("Loss change over epoch")
plt.legend();
# %%
# Per-step running-average loss, one panel per split.
fig, ax = plt.subplots(3,1,figsize=(20,15))
ax[0].plot(running_losses_df["train"], '-o', label="train")
ax[0].set_xlabel("Step")
ax[0].set_ylabel("Weighted x-entropy")
ax[0].set_title("Loss change over steps")
ax[0].legend();
ax[1].plot(running_losses_df["dev"], '-o', label="dev", color="orange")
ax[1].set_xlabel("Step")
ax[1].set_ylabel("Weighted x-entropy")
ax[1].set_title("Loss change over steps")
ax[1].legend();
ax[2].plot(running_losses_df["test"], '-o', label="test", color="mediumseagreen")
ax[2].set_xlabel("Step")
ax[2].set_ylabel("Weighted x-entropy")
ax[2].set_title("Loss change over steps")
ax[2].legend();
# %%
def sigmoid(x):
    """Numpy logistic function 1 / (1 + e^-x); works on scalars and arrays."""
    return 1.0 / (np.exp(-x) + 1.0)
def evaluate_model(model, predictions_df, key):
    """Fill `predictions_df` with predictions for every batch of dataloaders[key].

    Writes proba (sigmoid of the positive-class logit), true, predicted,
    x, y and patient_id per sample, then drops unfilled rows (e.g. the tail
    dropped by drop_last). Returns the filled dataframe.
    """
    was_training = model.training
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(dataloaders[key]):
            inputs = data["image"].to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            proba = outputs.cpu().numpy().astype(float)
            # .loc slices are inclusive, hence the  - 1 on the end row.
            rows = slice(i * BATCH_SIZE, (i + 1) * BATCH_SIZE - 1)
            predictions_df.loc[rows, "proba"] = sigmoid(proba[:, 1])
            predictions_df.loc[rows, "true"] = data["label"].numpy().astype(int)
            predictions_df.loc[rows, "predicted"] = preds.cpu().numpy().astype(int)
            predictions_df.loc[rows, "x"] = data["x"].numpy()
            predictions_df.loc[rows, "y"] = data["y"].numpy()
            predictions_df.loc[rows, "patient_id"] = data["patient_id"]
    # BUG FIX: restore the caller's train/eval mode — the original captured
    # was_training but never used it, leaving the model in eval mode.
    model.train(mode=was_training)
    predictions_df = predictions_df.dropna()
    return predictions_df
# %%
if run_training:
    # Score the freshly trained model on dev/test and persist the predictions.
    dev_predictions = pd.DataFrame(index=np.arange(0, dataset_sizes["dev"]), columns=["true", "predicted", "proba"])
    test_predictions = pd.DataFrame(index=np.arange(0, dataset_sizes["test"]), columns=["true", "predicted", "proba"])
    dev_predictions = evaluate_model(model, dev_predictions, "dev")
    test_predictions = evaluate_model(model, test_predictions, "test")
    dev_predictions.to_csv("dev_predictions.csv", index=False)
    test_predictions.to_csv("test_predictions.csv", index=False)
else:
    # Reuse saved predictions; patient ids come back from CSV as ints,
    # so cast them back to str to match the rest of the notebook.
    dev_predictions = pd.read_csv(LOSSES_PATH + "dev_predictions.csv")
    test_predictions = pd.read_csv(LOSSES_PATH + "test_predictions.csv")
    dev_predictions.patient_id = dev_predictions.patient_id.astype(str)
# %%
# Reconstruct three dev patients: tissue, ground-truth mask, predicted probability.
fig, ax = plt.subplots(3, 3, figsize=(20, 20))
print("Unique patient IDs in dev_predictions:")
print(dev_predictions['patient_id'].unique())
for n in range(3):
    idx = dev_predictions.patient_id.unique()[n]
    grid, mask, broken_patches, mask_proba = visualise_breast_tissue(idx, pred_df=dev_predictions)
    ax[n, 0].imshow(grid, alpha=0.9)
    ax[n, 1].imshow(mask, alpha=0.8)
    ax[n, 1].imshow(grid, alpha=0.7)
    ax[n, 2].imshow(mask_proba[:, :, 0], cmap="YlOrRd")
    for m in range(3):
        ax[n, m].set_xlabel("y-coord")
        ax[n, m].set_ylabel("x-coord")
        ax[n, m].grid(False)
    # BUG FIX: the titles used the stale global `patient_id` from a much
    # earlier cell; each row shows the patient selected as `idx`.
    ax[n, 0].set_title("Breast tissue slice of patient: " + idx)
    ax[n, 1].set_title("Cancer tissue colored red \n of patient: " + idx)
    ax[n, 2].set_title("Cancer probability")
# %%
dev_predictions.head()
# %%
# Target counts in dev plus the predicted probability distributions.
fig, ax = plt.subplots(1,3,figsize=(20,5))
sns.countplot(dev_predictions.true.astype(float), ax=ax[0], palette="Reds_r")
ax[0].set_title("Target counts of dev data");
sns.distplot(dev_predictions.proba.astype(float), ax=ax[1], kde=False, color="tomato")
# BUG FIX: this title belongs to the middle axis (ax[1]); the original set
# it on ax[0], silently overwriting ax[0]'s title and leaving ax[1] untitled.
ax[1].set_title("Predicted probability of cancer in dev");
sns.distplot(test_predictions.proba.astype(float), ax=ax[2], kde=False, color="mediumseagreen");
ax[2].set_title("Predicted probability of cancer in test");
# %%
from sklearn.metrics import confusion_matrix
def get_confusion_matrix(y_true, y_pred):
    """Row-normalized 2x2 confusion matrix with human-readable labels.

    Inputs are iterables of 0/1 labels; each row of the returned frame
    sums to 1, so cells are per-class rates.
    """
    transdict = {1: "cancer", 0: "no cancer"}
    y_t = np.array([transdict[x] for x in y_true])
    y_p = np.array([transdict[x] for x in y_pred])
    labels = ["no cancer", "cancer"]
    confusion = confusion_matrix(y_t, y_p, labels=labels)
    confusion_df = pd.DataFrame(confusion,
                                index=["actual no cancer", "actual cancer"],
                                columns=["predicted no cancer", "predicted cancer"])
    # Divide each row by its own total to turn counts into rates.
    for row in range(2):
        confusion_df.iloc[row] = confusion_df.iloc[row] / confusion_df.sum(axis=1).iloc[row]
    return confusion_df
# %%
Loading…
Cancel
Save