Skip to content
Snippets Groups Projects
Commit 3b1fd636 authored by Roel Vrooman's avatar Roel Vrooman
Browse files

code testing

parent f6d63860
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
**Here the functions are defined to perform the map conversions. Please note that these functions require some input provided on the gitlab** **Here the functions are defined to perform the map conversions. Please note that these functions require some input provided on the gitlab**
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#Import all necessary packages #Import all necessary packages
import pandas as pd import pandas as pd
import abagen import abagen
import allensdk import allensdk
from allensdk.api.queries.grid_data_api import GridDataApi from allensdk.api.queries.grid_data_api import GridDataApi
import os import os
import SimpleITK as sitk import SimpleITK as sitk
import numpy as np import numpy as np
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import MinMaxScaler
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache
from scipy.ndimage import zoom from scipy.ndimage import zoom
import nibabel as nib
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def get_all_data(asset_dir='Data_folder'): def get_all_data(asset_dir='Data_folder'):
"""Download both the mouse and human data from the ABA and processes them into arrays usable arrays """Download both the mouse and human data from the ABA and processes them into arrays usable arrays
!!WARNING!! This process will take a lot of time. Total download is roughly 18GB. !!WARNING!! This process will take a lot of time. Total download is roughly 18GB.
asset_dir = path to desired output folder for the data (default = 'Data_folder') asset_dir = path to desired output folder for the data (default = 'Data_folder')
Within the output folder several files are generated: Within the output folder several files are generated:
HUMAN HUMAN
For each donor (6 in total), there will be folders containing the microarray data as well as important meta data (all in .csv). For each donor (6 in total), there will be folders containing the microarray data as well as important meta data (all in .csv).
This function processes this data and generates two extra .csv files, namely the expression arrays (one containing This function processes this data and generates two extra .csv files, namely the expression arrays (one containing
colums/headers with information and one with just the expression values, cleaned_data and cleaned_data_noinf respectively). colums/headers with information and one with just the expression values, cleaned_data and cleaned_data_noinf respectively).
After further processing and parcellating the human data it will also generate the final expression per region files (again After further processing and parcellating the human data it will also generate the final expression per region files (again
one with and one without columns/header info) one with and one without columns/header info)
MOUSE MOUSE
The mouse data will be dowloaded into a folder per gene. Within this folder a gene vector will also be saved to be combined The mouse data will be dowloaded into a folder per gene. Within this folder a gene vector will also be saved to be combined
into one large gene expression file (mouse_expression_normalized). In the working directory it will also generate a into one large gene expression file (mouse_expression_normalized). In the working directory it will also generate a
mouse_connectivity folder in which a template from the ABA will be saved as well as some meta data for this file. mouse_connectivity folder in which a template from the ABA will be saved as well as some meta data for this file.
""" """
#Create necessary directories #Create necessary directories
os.mkdir(asset_dir) os.mkdir(asset_dir)
os.mkdir(f'{asset_dir}/human_data') os.mkdir(f'{asset_dir}/human_data')
os.mkdir(f'{asset_dir}/mouse_data')
os.mkdir(f'{asset_dir}/mouse_data/3D_Grid_Data')
#HUMAN DATA #HUMAN DATA
#First the human microarray data is downloaded #First the human microarray data is downloaded
files = abagen.fetch_microarray(donors='all', data_dir=f'{asset_dir}/human_data') files = abagen.fetch_microarray(donors='all', data_dir=f'{asset_dir}/human_data')
donor_list = [9861, 10021, 12876, 14380, 15496, 15697] donor_list = [9861, 10021, 12876, 14380, 15496, 15697]
gene_list = pd.read_csv('Gene_subset_HumHom_Final.csv') gene_list = pd.read_csv('Gene_subset_HumHom_Final.csv')
dict_gene_list = gene_list.to_dict(orient='list')
#Here the human data is processed and saved into arrays #Here the human data is processed (eg. remove probes without entrez id) and saved into arrays
for donor_number in donor_list: for donor_number in donor_list:
donor = pd.read_csv(f'{asset_dir}/human_data/normalized_microarray_donor{donor_number}/MicroarrayExpression.csv', header=None) donor = pd.read_csv(f'{asset_dir}/human_data/microarray/normalized_microarray_donor{donor_number}/MicroarrayExpression.csv', header=None)
probes = pd.read_csv(f'{asset_dir}/human_data/normalized_microarray_donor{donor_number}/Probes.csv') probes = pd.read_csv(f'{asset_dir}/human_data/microarray/normalized_microarray_donor{donor_number}/Probes.csv')
probe_id = probes[['probe_id', 'gene_symbol', 'entrez_id']] probe_id = probes[['probe_id', 'gene_symbol', 'entrez_id']]
donor_probe_id = probe_id.merge(donor, how='inner', left_on=['probe_id'], right_on=[0]) donor_probe_id = probe_id.merge(donor, how='inner', left_on=['probe_id'], right_on=[0])
donor_probe_id = donor_probe_id.drop([0], axis=1) donor_probe_id = donor_probe_id.drop([0], axis=1)
donor_probe_id = donor_probe_id[donor_probe_id['entrez_id'] > 0] donor_probe_id = donor_probe_id[donor_probe_id['entrez_id'] > 0]
donor_clean = donor_probe_id[donor_probe_id.entrez_id.isin(dict_gene_list['Human entrez ID'])] donor_clean = donor_probe_id.reset_index(drop=True)
donor_clean = donor_clean.reset_index(drop=True)
#Save one file with the probe info for easy reading #Save one file with the probe info for easy reading
donor_clean.to_csv(f'{asset_dir}/human_data/normalized_microarray_donor{donor_number}/cleaned_data.csv', index=False) donor_clean.to_csv(f'{asset_dir}/human_data/microarray/normalized_microarray_donor{donor_number}/cleaned_data.csv', index=False)
#Create and save one file in original format. This will replace the original expression file, since this is no longer needed #Create and save one file in original format. This will replace the original expression file, since this is no longer needed
donor_clean.rename(columns={'probe_id':'0'}, inplace=True) donor_clean.rename(columns={'probe_id':'0'}, inplace=True)
donor_clean = donor_clean.drop(['gene_symbol','entrez_id'], axis=1) donor_clean = donor_clean.drop(['gene_symbol','entrez_id'], axis=1)
donor_clean.to_csv(f'{asset_dir}/human_data/normalized_microarray_donor{donor_number}/MicroarrayExpression.csv', index=False) donor_clean.to_csv(f'{asset_dir}/human_data/microarray/normalized_microarray_donor{donor_number}/MicroarrayExpression.csv', index=False)
#Some furter processing of the human data to generate the final expression per region file #Some furter processing of the human data to generate the final expression per region file
files = abagen.fetch_microarray(donors='all', data_dir=f'{asset_dir}/human_data') #Make sure data is properly loaded atlas = abagen.fetch_desikan_killiany()
expression = abagen.get_expression_data(atlas['image'], atlas['info'], donors='all') expression = abagen.get_expression_data(atlas['image'], atlas['info'], donors='all', data_dir=f'{asset_dir}/human_data/')
expression_clean = expression[expression.A4GALT.notnull()] expression_clean = expression[expression.A4GALT.notnull()]
gene_filterforexp = list(gene_list['Human gene name']) gene_filterforexp = list(gene_list['Human gene name'])
expression_filtered = expression_clean[gene_filterforexp] expression_filtered = expression_clean[gene_filterforexp]
expression_filtered.to_csv(f'{asset_dir}/human_data/expression_filtered.csv', header=False, index=False) expression_filtered.to_csv(f'{asset_dir}/human_data/expression_filtered.csv', header=False, index=False)
expression_filtered.to_csv(f'{asset_dir}/human_data/expression_filtered_withinf.csv') expression_filtered.to_csv(f'{asset_dir}/human_data/expression_filtered_withinf.csv')
#MOUSE DATA #MOUSE DATA
#Secondly the mouse data is downloaded using allensdk.api GridDataApi class #Secondly the mouse data is downloaded using allensdk.api GridDataApi class
exp_data = GridDataApi() exp_data = GridDataApi()
for gene_id in gene_list.id: for gene_id in gene_list.id:
os.mkdir(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}') os.mkdir(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}')
exp_data.download_gene_expression_grid_data(section_data_set_id=gene_id, volume_type=GridDataApi.ENERGY, path=f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}') exp_data.download_gene_expression_grid_data(section_data_set_id=gene_id, volume_type=GridDataApi.ENERGY, path=f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/')
#Create a binary map based on template from ABA (used for data processing) #Create a binary map based on template from ABA (used for data processing)
mcc = MouseConnectivityCache(resolution=100) mcc = MouseConnectivityCache(resolution=100)
template = mcc.get_template_volume() template = mcc.get_template_volume()
template_img = sitk.ReadImage('mouse_connectivity/average_template_100.nrrd') template_img = sitk.ReadImage('mouse_connectivity/average_template_100.nrrd')
template_array = sitk.GetArrayFromImage(template_img) template_array = sitk.GetArrayFromImage(template_img)
template_200um = zoom(template_array, (0.51, 0.51, 0.51)) template_200um = zoom(template_array, (0.51, 0.51, 0.51))
template_200um_bi = template_200um > 1 template_200um_bi = template_200um > 1
template_200um_bi = template_200um_bi.astype(int) template_200um_bi = template_200um_bi.astype(int)
#Processing the mouse expression data and generating a final array #Processing the mouse expression data and generating a final array
for gene_id in gene_list.id: for gene_id in gene_list.id:
gene = sitk.ReadImage(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/energy.mhd') gene = sitk.ReadImage(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/energy.mhd')
gene_array = sitk.GetArrayFromImage(gene) gene_array = sitk.GetArrayFromImage(gene)
gene_array_cleaned = gene_array.copy() gene_array_cleaned = gene_array.copy()
gene_array_cleaned[np.where(template_200um_bi == 0)] = np.nan gene_array_cleaned[np.where(template_200um_bi == 0)] = np.nan
gene_vector = gene_array_cleaned.ravel() gene_vector = gene_array_cleaned.ravel()
gene_vector_cleaned = gene_vector[np.logical_not(np.isnan(gene_vector))] gene_vector_cleaned = gene_vector[np.logical_not(np.isnan(gene_vector))]
np.savetxt(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/gene_vector.csv', gene_vector_cleaned, delimiter=",") np.savetxt(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/gene_vector.csv', gene_vector_cleaned, delimiter=",")
mouse_expression = np.zeros([78682])[np.newaxis].T mouse_expression = np.zeros([78682])[np.newaxis].T
for gene_id in gene_list.id: for gene_id in gene_list.id:
gene_vector = np.genfromtxt(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/gene_vector.csv', delimiter=',') gene_vector = np.genfromtxt(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/gene_vector.csv', delimiter=',')
gene_vector_column = gene_vector[np.newaxis].T gene_vector_column = gene_vector[np.newaxis].T
mouse_expression = np.concatenate((mouse_expression, gene_vector_column), axis = 1) mouse_expression = np.concatenate((mouse_expression, gene_vector_column), axis = 1)
mouse_expression = np.delete(x_variable_list, 0, axis = 1) mouse_expression = np.delete(mouse_expression, 0, axis = 1)
#Normalize mouse gene expression and saving it to csv #Normalize mouse gene expression and saving it to csv
min_max_scaler = MinMaxScaler() min_max_scaler = MinMaxScaler()
mouse_expression_normalized = min_max_scaler.fit_transform(mouse_expression) mouse_expression_normalized = min_max_scaler.fit_transform(mouse_expression)
pd.DataFrame(mouse_expression_normalized).to_csv(f"{asset_dir}/mouse_data/3D_Grid_Data/mouse_expression_normalized.csv", header=None, index=None) pd.DataFrame(mouse_expression_normalized).to_csv(f"{asset_dir}/mouse_data/3D_Grid_Data/mouse_expression_normalized.csv", header=None, index=None)
#Create mouse expression version containing more info (used for human2mouse code) #Create mouse expression version containing more info (used for human2mouse code)
mouse_x = np.zeros([159326])[np.newaxis].T mouse_x = np.zeros([159326])[np.newaxis].T
for gene_id in gene_list.id: for gene_id in gene_list.id:
gene = sitk.ReadImage(f'3D_Grid_Data/{gene_id}/energy.mhd') gene = sitk.ReadImage(f'{asset_dir}/mouse_data/3D_Grid_Data/{gene_id}/energy.mhd')
gene_array = sitk.GetArrayFromImage(gene) gene_array = sitk.GetArrayFromImage(gene)
gene_vector = gene_array.ravel() gene_vector = gene_array.ravel()
gene_vector_column = gene_vector[np.newaxis].T gene_vector_column = gene_vector[np.newaxis].T
mouse_x = np.concatenate((mouse_x, gene_vector_column), axis = 1) mouse_x = np.concatenate((mouse_x, gene_vector_column), axis = 1)
mouse_x = np.delete(mouse_x, 0, axis = 1) mouse_x = np.delete(mouse_x, 0, axis = 1)
pd.DataFrame(mouse_x).to_csv(f"{asset_dir}/mouse_data/3D_Grid_Data/mouse_expression_total.csv", header=None, index=None) pd.DataFrame(mouse_x).to_csv(f"{asset_dir}/mouse_data/3D_Grid_Data/mouse_expression_total.csv", header=None, index=None)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
get_all_data()
```
%% Cell type:code id: tags:
``` python
def mouse2human(mouse_nii, asset_dir='Data_folder', output='to_human'): def mouse2human(mouse_nii, asset_dir='Data_folder', output='to_human'):
"""Convert your mouse map into a human map """Convert your mouse map into a human map
mouse_nii = path to your mouse map mouse_nii = path to your mouse map
asset_dir = directory where the data is saved (provided in get_all_data function, default = 'Data_folder') asset_dir = directory where the data is saved (provided in get_all_data function, default = 'Data_folder')
output = output directory (will return both human_atlas.nii and human_vector.csv, default = 'to_human') output = output directory (will return both human_atlas.nii and human_vector.csv, default = 'to_human')
""" """
#Create necessary directories #Create necessary directories
os.mkdir(output) os.mkdir(output)
#Create a binary map based on template from ABA (used for data processing)
template_img = sitk.ReadImage('mouse_connectivity/average_template_100.nrrd')
template_array = sitk.GetArrayFromImage(template_img)
template_200um = zoom(template_array, (0.51, 0.51, 0.51))
template_200um_bi = template_200um > 1
template_200um_bi = template_200um_bi.astype(int)
#Create the y variable for the model (based on mouse map) #Create the y variable for the model (based on mouse map)
fMRI_template = sitk.ReadImage(mouse_nii) fMRI_template = sitk.ReadImage(mouse_nii)
fMRI_array = sitk.GetArrayFromImage(fMRI_template) fMRI_array = sitk.GetArrayFromImage(fMRI_template)
#fMRI_array = np.transpose(fMRI_array, (2,0,1)) #this piece of code is specific for Jo's mouse map size
#fMRI_array = np.flip(fMRI_array)
#fMRI_array = zoom(fMRI_array, (1.01, 1.02, 1.01))
fMRI_array[np.where(template_200um_bi == 0)] = np.nan fMRI_array[np.where(template_200um_bi == 0)] = np.nan
fMRI_array = fMRI_array.ravel() fMRI_array = fMRI_array.ravel()
fMRI_array = fMRI_array[np.logical_not(np.isnan(fMRI_array))] fMRI_array = fMRI_array[np.logical_not(np.isnan(fMRI_array))]
#normalize the data #normalize the data
min_max_scaler = MinMaxScaler() min_max_scaler = MinMaxScaler()
fMRI_array = fMRI_array.reshape(-1, 1) fMRI_array = fMRI_array.reshape(-1, 1)
fMRI_array_normalized = min_max_scaler.fit_transform(fMRI_array) fMRI_array_normalized = min_max_scaler.fit_transform(fMRI_array)
fMRI_array_normalized = fMRI_array_normalized.ravel() fMRI_array_normalized = fMRI_array_normalized.ravel()
#load the x variable (mouse gene expression) #load the x variable (mouse gene expression)
x_normalized = pd.read_csv(f"{asset_dir}/mouse_data/3D_Grid_Data/mouse_expression_normalized.csv", header=None) x_normalized = pd.read_csv(f"{asset_dir}/mouse_data/3D_Grid_Data/mouse_expression_normalized.csv", header=None)
x_normalized_np = x_normalized.to_numpy() x_normalized_np = x_normalized.to_numpy()
#run the linear model #run the linear model
model = LinearRegression() model = LinearRegression()
model.fit(x_normalized_np, fMRI_array_normalized) model.fit(x_normalized_np, fMRI_array_normalized)
#read in the human gene expression and use as new x variable #read in the human gene expression and use as new x variable
human_gene_vector = pd.read_csv(f'{asset_dir}/human_data/expression_filtered.csv', header=None) human_gene_vector = pd.read_csv(f'{asset_dir}/human_data/expression_filtered.csv', header=None)
human_gene_vector_np = human_gene_vector.to_numpy() human_gene_vector_np = human_gene_vector.to_numpy()
y_human = model.predict(human_gene_vector_np) y_human = model.predict(human_gene_vector_np)
#load human y variable into a brain map #load human y variable into a brain map
atlas = nib.load('Desikan_Killiany_atlas/atlas-desikankilliany.nii.gz') atlas = nib.load('atlas-desikankilliany.nii.gz')
atlas_data = atlas.get_fdata() atlas_data = atlas.get_fdata()
expression_index = pd.read_csv(f'{asset_dir}/human_data/expression_filtered_withinf.csv') #information from columns is needed for next steps expression_index = pd.read_csv(f'{asset_dir}/human_data/expression_filtered_withinf.csv') #information from columns is needed for next steps
human_y_vector = pd.DataFrame(data=y_human) human_y_vector = pd.DataFrame(data=y_human)
human_y_vector['index'] = list(expression_index.label) human_y_vector['index'] = list(expression_index.label)
human_y_vector = human_y_vector.set_index('index') human_y_vector = human_y_vector.set_index('index')
human_y_dict = human_y_vector.to_dict() human_y_dict = human_y_vector.to_dict()
human_y_dict = human_y_dict[0] human_y_dict = human_y_dict[0]
atlas_empty = np.zeros(shape=(197, 233, 189)) atlas_empty = np.zeros(shape=(197, 233, 189))
for i in list(list(human_y_dict.keys())): for i in list(list(human_y_dict.keys())):
atlas_empty = np.where(atlas_data == i, human_y_dict[i], atlas_empty) atlas_empty = np.where(atlas_data == i, human_y_dict[i], atlas_empty)
atlas_empty_img = nib.Nifti1Image(atlas_empty, atlas.affine) atlas_empty_img = nib.Nifti1Image(atlas_empty, atlas.affine)
atlas_empty_vector = atlas_empty.ravel() atlas_empty_vector = atlas_empty.ravel()
#save atlas as nii and vector #save atlas as nii and vector
nib.save(atlas_empty_img, f'{asset_dir}/human_data/{output}/human_atlas.nii') nib.save(atlas_empty_img, f'{output}/human_atlas.nii')
pd.DataFrame(atlas_empty_vector).to_csv(f'{asset_dir}/human_data/{output}/human_vector.csv', header=None, index=None) pd.DataFrame(atlas_empty_vector).to_csv(f'{output}/human_vector.csv', header=None, index=None)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
mouse2human('CAP_NIFTI_ZScored_CSD_v2_CAP3.nii')
```
%% Cell type:code id: tags:
``` python
def human2mouse(human_nii, asset_dir='Data_folder', output='to_mouse'): def human2mouse(human_nii, asset_dir='Data_folder', output='to_mouse'):
"""Convert your human map into a mouse map """Convert your human map into a mouse map
human_nii = path to your human map human_nii = path to your human map
asset_dir = directory where the data is saved (provided in get_all_data function, default = 'Data_folder') asset_dir = directory where the data is saved (provided in get_all_data function, default = 'Data_folder')
output = output directory (will return both mouse_atlas.nii and mouse_vector.csv, default = 'to_mouse') output = output directory (will return both mouse_atlas.nii and mouse_vector.csv, default = 'to_mouse')
""" """
#Create necessary directories #Create necessary directories
os.mkdir(output) os.mkdir(output)
#Human y variable is simply the meants taken from one of Jo's CAPS #Human y variable is simply the meants taken from one of Jo's CAPS
human_y = np.genfromtxt('Meants_files/Human_CAPS/CAP1_meants.txt') human_y = np.genfromtxt(human_nii)
#But regions 3 and 28 are missing from the gene expression data, so delete those (array start from 0, so 2 and 27) #But regions 3 and 28 are missing from the gene expression data, so delete those (array start from 0, so 2 and 27)
human_y = np.delete(human_y, [2,27]) human_y = np.delete(human_y, [2,27])
#load the human x variable #load the human x variable
human_gene_vector = pd.read_csv('microarray/expression_filtered.csv', header=None) human_gene_vector = pd.read_csv(f'{asset_dir}/human_data/expression_filtered.csv', header=None)
human_gene_vector_np = human_gene_vector.to_numpy() human_gene_vector_np = human_gene_vector.to_numpy()
#Run the model #Run the model
model = LinearRegression() model = LinearRegression()
model.fit(human_gene_vector_np, human_y) model.fit(human_gene_vector_np, human_y)
#load the mouse expression to use as x variable #load the mouse expression to use as x variable
mouse_x_variable = pd.read_csv("3D_Grid_Data/mouse_x_total", header=None) mouse_x_variable = pd.read_csv(f"{asset_dir}/mouse_data/3D_Grid_Data/mouse_expression_total.csv", header=None)
mouse_x_variable_np = mouse_x_variable.to_numpy() mouse_x_variable_np = mouse_x_variable.to_numpy()
#Predict y values based on mouse expression and load into a nii file #Predict y values based on mouse expression and load into a nii file
y_mouse = model.predict(mouse_x_variable_np) y_mouse = model.predict(mouse_x_variable_np)
mouse_weigted_brain = np.reshape(y_mouse, (58, 41, 67)) mouse_weigted_brain = np.reshape(y_mouse, (58, 41, 67))
atlas = nib.load(mouse_template) atlas = nib.load('CAP_NIFTI_ZScored_CSD_v2_CAP3.nii')
mouse_weigted_nii = nib.Nifti1Image(mouse_weigted_brain, atlas.affine) mouse_weigted_nii = nib.Nifti1Image(mouse_weigted_brain, atlas.affine)
#save atlas as nii and vector #save atlas as nii and vector
nib.save(mouse_weigted_nii, f'{asset_dir}/mouse_data/{output}/mouse_atlas.nii') nib.save(mouse_weigted_nii, f'{output}/mouse_atlas.nii')
pd.DataFrame(y_mouse).to_csv(f'{asset_dir}/mouse_data/{output}/mouse_vector.csv', header=None, index=None) pd.DataFrame(y_mouse).to_csv(f'{output}/mouse_vector.csv', header=None, index=None)
```
%% Cell type:code id: tags:
``` python
human2mouse('CAP3_meants.txt')
```
%% Output
pixdim[0] (qfac) should be 1 (default) or -1; setting qfac to 1
INFO: pixdim[0] (qfac) should be 1 (default) or -1; setting qfac to 1
%% Cell type:code id: tags:
``` python
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment