Fundamentals of Medical Image Processing for Deep Learning
- Loading the Data
- Conversion in Hounsfeild Units
- Windowing
- Histogram Analysis
- Storing Metadata in Dataframe
- Voxel Size and Volume
- Resampling
- 3D Plotting
- Segmentation
import numpy as np
import pandas as pd
import scipy.ndimage
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from sklearn.cluster import KMeans
import shutil
import cv2
import pydicom
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
import seaborn as sns
from IPython.display import HTML
import gdcm
import os
def set_options():
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000)
set_options()
TRAIN_PATH = 'osic-pulmonary-fibrosis-progression/train.csv'
TRAIN_IMG_PATH = 'osic-pulmonary-fibrosis-progression/train'
TEST_PATH = 'osic-pulmonary-fibrosis-progression/test.csv'
TEST_IMG_PATH = 'osic-pulmonary-fibrosis-progression/test'
SUBMISSION_PATH = 'osic-pulmonary-fibrosis-progression/sample_submission.csv'
''' Load CSV data'''
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
train[train.duplicated(['Patient', 'Weeks'], keep = False)]
train = train.drop_duplicates()
train.head(10)
HTML('<iframe width="600" height="400" src="https://www.youtube.com/embed/KZld-5W99cI" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')
# In this dataset the ImagePosition is not available so we'll sort the slices on Instance Number
def load_slices(path):
filenames = os.listdir(path)
slices = [pydicom.dcmread(f'{path}/{file}') for file in filenames]
slices.sort(key = lambda x: int(x.InstanceNumber), reverse=True)
return slices
scans = load_slices(f'{TRAIN_IMG_PATH}/ID00007637202177411956430')
print(len(scans))
# here is the first slice of dicom
scans[0]
Some Meta Data Which is useful for processing
- Slice Thickness: it is distance b/w two slices, further useful for getting the volume of the image
- Image Orientation (patient) and Slice Location comes in use when the Slice Thickness is not given in the dataset so, slice thickness = difference b/w slice location of any two slices
- Pixel Spacing is vimp metadata, it gives pixel size x and y coordinates
- window center and the window window width are vimp as they give region of interest
def get_pixels_hu(slices):
image = np.stack([s.pixel_array for s in slices])
image = image.astype(np.int16)
# Set outside-of-scan pixels to 0
# The intercept is usually -1024, so air is approximately 0
image[image <= -1000] = 0
# convert to HU
for slice_number in range(len(slices)):
intercept = slices[slice_number].RescaleIntercept
slope = slices[slice_number].RescaleSlope
if slope != 1:
image[slice_number] = slope * image[slice_number].astype(np.float64)
image[slice_number] = image[slice_number].astype(np.int16)
image[slice_number] += np.int16(intercept)
return np.array(image, dtype=np.int16)
Windowing
windowing is the seletion of range of range of greyscale in the image that we need, basically it gives the region of interest using parameters like window width and window level(center)
window width - greyscale which can be displayed on the image, we do not want the black color bcoz that is the air, we do not want that density to show, so we adjust these greyscale range so that we can just get only the lung portion of that.
window center or level - center of that particular range of that window width
in our case WW of 100 HU could mean the grayscale only ranges from 0HU to +100 HU, with a WL of +50 HU. For Lungs window width is 1500 and window level is -600, so grayscale ranges from 150HU to -1350HU. More about windowing can be read here. How to get window width and level is explained briefly in this article.
def apply_window(hu_image, center, width):
hu_image = hu_image.copy()
min_value = center - width // 2
max_value = center + width // 2
hu_image[hu_image < min_value] = min_value
hu_image[hu_image > max_value] = max_value
return hu_image
train.loc[0]['Patient']
fig, ax = plt.subplots(1, 2, figsize= (20, 5))
example = train.loc[0]['Patient']
scans = load_slices(f'{TRAIN_IMG_PATH}/{example}')
rescaled_images = get_pixels_hu(scans)
images = [scan.pixel_array for scan in scans]
for i in range(10):
sns.histplot(images[i].flatten(), ax=ax[0])
sns.histplot(rescaled_images[i].flatten(), ax=ax[1])
ax[0].set_title("Raw pixel array distributions for 10 examples")
ax[1].set_title("HU unit distributions for 10 examples")
plt.show()
def get_dicom_raw(dicom):
return ({attr:getattr(dicom, attr) for attr in dir(dicom) if attr[0].isupper() and attr not in ['PixelData']})
%%time
# Get dicom metadata
# Image features like lung volume are implementation from a detailed discussion "Domain expert's insight" by Dr. Konya.
# https://www.kaggle.com/c/osic-pulmonary-fibrosis-progression/discussion/165727
def get_dicom_metadata(df):
patients = df.Patient.unique()
dicom_metadata = []
for patient in patients:
path = f'{TRAIN_IMG_PATH}/{patient}'
img_list = os.listdir(path)
for img in img_list:
image = pydicom.dcmread(f'{path}/{img}')
record = get_dicom_raw(image)
raw = image.pixel_array
pixelspacing_r, pixelspacing_c = image.PixelSpacing[0], image.PixelSpacing[1]
row_distance = pixelspacing_r * image.Rows
col_distance = pixelspacing_c * image.Columns
record.update({'raw_min':raw.min(),
'raw_max':raw.max(),
'raw_mean':raw.mean(),
'raw_std':raw.std(),
'raw_diff':raw.max()-raw.min(),
'pixel_spacing_area':pixelspacing_r * pixelspacing_c,
'img_area':image.Rows * image.Columns,
'pixel_row_distance':row_distance,
'pixel_col_distance':col_distance,
'slice_area_cm2':(0.1 * row_distance) * (0.1 * col_distance),
'slice_vol_cm3':(0.1 * image.SliceThickness) * (0.1 * row_distance) * (0.1 * col_distance),
'patient_img_path':f'{path}/{img}'})
dicom_metadata.append(record)
metadata_df = pd.DataFrame(dicom_metadata)
metadata_df.to_pickle('metadata_df.pkl')
return metadata_df
metadata_df = get_dicom_metadata(train.copy())
metadata_df.head()
plt.tight_layout()
fig, ax = plt.subplots(2, 2, figsize=(20,10))
sns.histplot(metadata_df.pixel_row_distance, ax = ax[0,0], color='green')
sns.histplot(metadata_df.pixel_col_distance, ax = ax[0,1], color='blue')
sns.histplot(metadata_df.slice_area_cm2, ax=ax[1,0], color='pink')
sns.histplot(metadata_df.slice_vol_cm3, ax=ax[1, 1], color='magenta')
ax[0,0].set_title("Pixel Rows Distace")
ax[0,0].set_xlabel("Pixel Rows")
ax[0,1].set_title("Pixel Column Distance")
ax[0,1].set_xlabel("Pixel Columns")
ax[1,0].set_title("CT-slice area in $cm^{2}$")
ax[1,0].set_xlabel("Area in $cm^{2}$")
ax[1,1].set_title("CT-slice volume in $cm^{3}$")
ax[1,1].set_xlabel("Volume in $cm^{3}$")
plt.show(ax[0,0])
highest_vol_patients = list(metadata_df[metadata_df.slice_vol_cm3 == max(metadata_df.slice_vol_cm3)]['PatientID'])
lowest_vol_patients = list(metadata_df[metadata_df.slice_vol_cm3 == min(metadata_df.slice_vol_cm3)]['PatientID'])
#load scans
max_vol_scans = load_slices(f"{TRAIN_IMG_PATH}/{highest_vol_patients[0]}")
min_vol_scans = load_slices(f"{TRAIN_IMG_PATH}/{lowest_vol_patients[0]}")
#convert to HU
max_vol_hu_imgs = get_pixels_hu(max_vol_scans)
min_vol_hu_imgs = get_pixels_hu(min_vol_scans)
#Apply Windowing
# we can try with different window width and levels
max_vol_window_img = apply_window(max_vol_hu_imgs[20], -600, 1200)
min_vol_window_img = apply_window(min_vol_hu_imgs[18], -600, 1200)
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
ax[0].imshow(max_vol_window_img, cmap='YlGnBu')
ax[0].set_title("CT with large volume")
ax[1].imshow(min_vol_window_img, cmap='YlGnBu')
ax[1].set_title("CT with min volume")
plt.show(ax[0], ax[1])
metadata_df.SliceThickness.unique()
#so when slice thickness is different then the voxel size of two images is also very much different
patient1 = train.Patient.unique()[0]
patient2 = train.Patient.unique()[5]
scans1 = load_slices(f"{TRAIN_IMG_PATH}/{patient1}")
scans2 = load_slices(f"{TRAIN_IMG_PATH}/{patient2}")
print(f"{scans1[0].SliceThickness}, {scans1[0].PixelSpacing}")
print(f"{scans2[0].SliceThickness}, {scans2[0].PixelSpacing}")
patient1_hu_scans = get_pixels_hu(scans1)
patient2_hu_scans = get_pixels_hu(scans2)
def resample(image, scan, new_spacing=[1,1,1]):
# Determine current pixel spacing
spacing = np.array([scan[0].SliceThickness] + list(scan[0].PixelSpacing), dtype=np.float32)
resize_factor = spacing / new_spacing
new_real_shape = image.shape * resize_factor
new_shape = np.round(new_real_shape)
real_resize_factor = new_shape / image.shape
new_spacing = spacing / real_resize_factor
image = scipy.ndimage.interpolation.zoom(image, real_resize_factor)
return image, new_spacing
image1, rounded_new_spacing1 = resample(patient1_hu_scans, scans1, [1,1,1])
image2, rounded_new_spacing2 = resample(patient2_hu_scans, scans2, [1,1,1])
print(f"Original shape : {patient2_hu_scans.shape}")
print(f"Shape after resampling : {image2.shape}")
def plot_3d(image,threshold=800):
# Position the scan upright,
# so the head of the patient would be at the top facing the
# camera
p = image.transpose(2,1,0)
verts, faces, _, _ = measure.marching_cubes_lewiner(p, threshold)
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
# Fancy indexing: `verts[faces]` to generate a collection of
# triangles
mesh = Poly3DCollection(verts[faces], alpha=0.70)
face_color = [0.35, 0.65, 0.75]
mesh.set_facecolor(face_color)
ax.add_collection3d(mesh)
ax.set_xlim(0, p.shape[0])
ax.set_ylim(0, p.shape[1])
ax.set_zlim(0, p.shape[2])
plt.show()
plot_3d(image1)
plot_3d(patient1_hu_scans)
def make_lungmask(img, display=False):
row_size = img.shape[0]
col_size = img.shape[1]
mean = np.mean(img)
std = np.std(img)
img = img-mean
img = img/std
# Find the average pixel value near the lungs
# to renormalize washed out images
middle = img[int(col_size/5):int(col_size/5*4), int(row_size/5):int(row_size/5*4)]
mean = np.mean(middle)
max = np.max(img)
min = np.min(img)
# to improve threshold finding, i'm moving the underflow and overflow on the pixel spectrum
img[img==max] = mean
img[img==min] = mean
# using kmeans to seperate foreground ( soft tissue / bone) and background (lung/air)
kmeans = KMeans(n_clusters=2).fit(np.reshape(middle,[np.prod(middle.shape),1]))
centers = sorted(kmeans.cluster_centers_.flatten())
threshold = np.mean(centers)
# Threshold the image and the o/p will be a binary image, morphology works either on binary or gray scale images
thresh_img = np.where(img<threshold, 1.0, 0.0)
eroded = morphology.erosion(thresh_img, np.ones([3,3]))
dilation = morphology.dilation(eroded, np.ones([8,8]))
labels = measure.label(dilation) # different labels are displayed in different colors
label_vals = np.unique(labels)
regions = measure.regionprops(labels)
good_labels = []
for prop in regions:
B = prop.bbox
if B[2]-B[0]<row_size/10*9 and B[3]-B[1]<col_size/10*9 and B[0]>row_size/5 and B[2]<col_size/5*4:
good_labels.append(prop.label)
mask = np.ndarray([row_size, col_size], dtype=np.int8)
mask[:]=0
# after just the lungs are left, we do another large dilation in prder to fill in and out lung mask
for N in good_labels:
mask = mask + np.where(labels==N,1,0)
mask = morphology.dilation(mask,np.ones([10,10])) #last dilation
if (display):
fig, ax = plt.subplots(3, 2, figsize=[12, 12])
ax[0, 0].set_title("Original")
ax[0, 0].imshow(img, cmap='gray')
ax[0, 0].axis('off')
ax[0, 1].set_title("Threshold")
ax[0, 1].imshow(thresh_img, cmap='gray')
ax[0, 1].axis('off')
ax[1, 0].set_title("After Erosion and Dilation")
ax[1, 0].imshow(dilation, cmap='gray')
ax[1, 0].axis('off')
ax[1, 1].set_title("Color Labels")
ax[1, 1].imshow(labels)
ax[1, 1].axis('off')
ax[2, 0].set_title("Final Mask")
ax[2, 0].imshow(mask, cmap='gray')
ax[2, 0].axis('off')
ax[2, 1].set_title("Apply Mask on Original")
ax[2, 1].imshow(mask*img, cmap='gray')
ax[2, 1].axis('off')
plt.show()
return mask*img
make_lungmask(image1[14], True)
def get_rows_cols(size):
cols = 6
rows = size // cols
if (int(size%cols) != 0):
rows = rows+1
return rows,cols
def plot_stack(stack, start_with=10, show_every=3):
size = (len(stack) - (start_with - 1))//show_every
rows, cols = get_rows_cols(size)
plt.tight_layout()
fig,ax = plt.subplots(rows,cols,figsize=[12,12])
for i in range(size-1):
ind = start_with + i*show_every
ax[int(i/cols),int(i % cols)].set_title('slice %d' % ind)
ax[int(i/cols),int(i % cols)].imshow(stack[ind],cmap='gray')
ax[int(i/cols),int(i % cols)].axis('off')
plt.show()
plot_stack(patient1_hu_scans, start_with=0, show_every=1)
masked_lung = []
for img in image1:
masked_lung.append(make_lungmask(img))
plot_stack(masked_lung, start_with=0, show_every=1)
To Save Segmented image to png/npz format
path = "./segmented-images" if not shutil.os.path.isdir(path): shutil.os.mkdir(path)
patients = train.Patient.unique()[0:10] for patient in patients:
#if not shutil.os.path.isdir(path + "/" + patient):
# shutil.os.mkdir(path + "/" + patient)
scans = load_slices(f'{TRAIN_IMG_PATH}/{patient}')
hu_imgs = get_pixels_hu(scans)
rescaled_images, spacing = resample(hu_imgs, scans,[1,1,1])
masked_lung = []
for img_number in range(len(rescaled_images)):
window_img = apply_window(rescaled_images[img_number], -600, 1200)
masked_img = make_lungmask(window_img)
masked_lung.append(masked_img)
#cv2.imwrite(f'{path}/{patient}/{img_number + 1}.png', masked_img)
# Comment the below line if images required to store in .png format.
np.savez(f'{path}/{patient}',masked_lung)