I have this code to convert .VCF files to .GENO files. To test it, I used the smallest file on my laptop, but to process all of the files I need a bigger machine, and it would be really nice if it didn't take weeks to run. The code works fine on my laptop (i7 4th gen), but it is really slow on our workstation server (48 cores, but each core is slower). How can I modify the code to use more cores? Thank you in advance. The code:

import allel
import pandas as pd
import numpy as np
from time import process_time
import numba as nb
@nb.jit(forceobj=True)
def create_chrpos(data, n):
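    # Collect every position in `data` that lies on chromosome n.
    # Note: np.append reallocates the whole array on each call, so this loop is quadratic.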
    chr_pos = []
    chr_pos = np.array(chr_pos, dtype=np.int32)
    for i in range(len(data)):
        if data['chr'][i] == n:
            if i == 0:
                chr_pos = data['pos'][0]
            else:
                a = data['pos'][i]
                chr_pos = np.append(chr_pos, [a])

    return chr_pos
@nb.njit
def create_needed_pos(chr_pos, pos):
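    # Keep the positions that occur in both chr_pos and pos
    # (a quadratic all-pairs intersection).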
    needed_pos = nb.typed.List.empty_list(nb.int32)
    for i in range(len(chr_pos)):
        for k in range(len(pos)):
            if chr_pos[i] == pos[k]:
                if i == k == 1:
                    needed_pos = nb.typed.List([pos[k]])
                else:
                    needed_pos.append(pos[k])
    return needed_pos
@nb.njit
def create_needed_index(chr_pos, pos):
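    # Note: identical to create_needed_pos above; it appends positions, not indices.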
    needed_index = nb.typed.List.empty_list(nb.int32)
    for i in range(len(chr_pos)):
        for k in range(len(pos)):
            if chr_pos[i] == pos[k]:
                if i == k == 1:
                    needed_index = nb.typed.List([pos[k]])
                else:
                    needed_index.append(pos[k])
    return needed_index
@nb.njit
def create_mat(geno):
    # create matrix as np.uint8 (1 byte) instead of list of python integers (8 byte)
    # also no need to dynamically resize / increase list size
    geno_mat = np.zeros((len(geno[:, 0]), len(geno[1, :])), dtype=np.uint8)

    for i in np.arange(len(geno[:, 0])):
        for k in np.arange(len(geno[1, :])):
            g = geno[i, k]

            # nested ifs to avoid duplicate comparisons
            if g[0] == 0:
                if g[1] == 0:
                    geno_mat[i, k] = 2
                elif g[1] == 1:
                    geno_mat[i, k] = 1
                else:
                    geno_mat[i, k] = 9
            elif g[0] == 1:
                if g[1] == 0:
                    geno_mat[i, k] = 1
                elif g[1] == 1:
                    geno_mat[i, k] = 0
                else:
                    geno_mat[i, k] = 9
            else:
                geno_mat[i, k] = 9
    return geno_mat
def genotyping(geno, pos, chr_pos):
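    # Encode the genotypes, keep the rows whose position occurs in chr_pos,
    # and pad positions absent from the VCF with 2.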
    needed_pos = create_needed_pos(chr_pos, pos)
    create_needed_index(chr_pos, pos)
    mat = create_mat(geno)
    list_difference = [item for item in chr_pos if item not in needed_pos]
    needed_pos_list = list(needed_pos)
    matrix_df = pd.DataFrame(mat, dtype=int, index=pos)
    filtered_geno_dataframe = matrix_df.loc[needed_pos_list, :]
    missing_positions_df = pd.DataFrame(index=list_difference, columns=np.arange(2054))
    missing_positions_df.fillna(2, inplace=True)
    finaldataframe = pd.concat([filtered_geno_dataframe, missing_positions_df])
    finaldataframe.sort_index(axis=0, inplace=True)
    final_mat = finaldataframe.to_numpy(dtype=np.int32)
    return final_mat
def write_first_chr(genotype):
    with open('test_1.geno', 'wb') as fout:  # 'wb' so we can seek from the end
        np.savetxt(fout, genotype, delimiter="", fmt='%d')
        fout.seek(-1, 2)  # savetxt ends the file with a single '\n'; strip it
        fout.truncate()
def write_remaining_chr(genotype):
    with open('test_1.geno', 'ab') as fout:  # 'ab', not 'a': end-relative seeks need binary mode
        np.savetxt(fout, genotype, delimiter="", fmt='%d')
        fout.seek(-1, 2)  # strip the trailing '\n' again
        fout.truncate()
if __name__ == "__main__":
    t1_start = process_time()
    data = pd.read_csv('REICH_1KG.snp', delimiter=r"\s+")
    data.columns = ['ID', "chr", "pyspos", "pos", "Ref", "Alt"]
    samples = open("sample_list_test.txt")
    for i, line in enumerate(samples):
        strip_line = line.strip()
        n = i + 1
        chr_pos = create_chrpos(data, n)
        geno = allel.read_vcf(strip_line, fields=("calldata/GT",))["calldata/GT"]
        pos = allel.read_vcf(strip_line, fields=("variants/POS",))["variants/POS"]
        genotype = genotyping(geno, pos, chr_pos)
        if n == 1:
            print("First chromosome done")
            write_first_chr(genotype)
        else:
            write_remaining_chr(genotype)
            print("Done: chr number:", n)
    print("Finished genotyping")
    t1_stop = process_time()
    print("Ennyi idő kellett teszt1:", t1_stop - t1_start)
  • You might be interested in (and saddened by) [the GIL](https://stackoverflow.com/a/1294402/453616). – gspr Apr 12 '21 at 13:40
  • Parallelising Python code across multiple threads unfortunately requires explicit multiprocessing (because of the GIL). Check out https://stackoverflow.com/a/1704501/1968 for a basic solution. — Alternatively, push the parallelisation down to C code. – Konrad Rudolph Apr 12 '21 at 13:43 (see the multiprocessing sketch after this list)
  • Where do you spend most time? Do you have a large number of samples? One option is to keep the Python code single-threaded, split the samples into several groups, run the script on each subset, then merge the resulting csv files. – Nielk Apr 12 '21 at 13:44
  • Thanks Nielk, basically it's a good idea. Nearly all of the steps are slow; that is my problem. These are mostly big files (200 MB–1.5 GB). I think the create_mat and create_needed_pos functions could work with batches. (And I just realised that create_needed_index is useless...) – Emil Nyerki Apr 12 '21 at 16:50
  • I have tried splitting the data. The original create_mat numba function takes 18.31 s; split into 10 parts it takes 25.8 s, 100 parts 80.75 s, 1000 parts 601 s. Used this code: temp_array = np.array_split(geno, 1000); for k in range(len(temp_array)): if k == 0: final_mat_2 = create_mat(temp_array[k]) else: temp = create_mat(temp_array[k]); final_mat_2 = np.concatenate((final_mat_2, temp)) – Emil Nyerki Apr 13 '21 at 11:13 (see the prange sketch after this list)
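
Following the suggestions above, a minimal sketch of per-chromosome multiprocessing, assuming the functions from the question (create_chrpos, genotyping, write_first_chr, write_remaining_chr) are defined in the same script; process_chromosome and max_workers=8 are illustrative assumptions, not part of the original code. Each VCF listed in sample_list_test.txt is handled by its own worker process, which sidesteps the GIL, and ProcessPoolExecutor.map yields results in input order, so the chromosomes are still written out sequentially:

import concurrent.futures as cf
import allel
import pandas as pd

def process_chromosome(args):
    # Hypothetical worker: runs the existing single-threaded pipeline for one VCF.
    # Note that numba compiles its functions once per worker process.
    vcf_path, n, data = args
    chr_pos = create_chrpos(data, n)  # from the question
    vcf = allel.read_vcf(vcf_path, fields=("calldata/GT", "variants/POS"))
    return n, genotyping(vcf["calldata/GT"], vcf["variants/POS"], chr_pos)

if __name__ == "__main__":
    data = pd.read_csv('REICH_1KG.snp', delimiter=r"\s+")
    data.columns = ['ID', "chr", "pyspos", "pos", "Ref", "Alt"]
    with open("sample_list_test.txt") as samples:
        # The SNP table is pickled to every worker; acceptable for a sketch.
        jobs = [(line.strip(), i + 1, data) for i, line in enumerate(samples)]
    # One chromosome per task; max_workers is an illustrative value.
    with cf.ProcessPoolExecutor(max_workers=8) as pool:
        for n, genotype in pool.map(process_chromosome, jobs):
            if n == 1:
                write_first_chr(genotype)  # from the question
            else:
                write_remaining_chr(genotype)

Note that process_time only counts the parent process's CPU time, so it will under-report once the workers do the heavy lifting; time.perf_counter measures wall-clock time instead.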
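
As for the batching timings in the last comment: every extra batch pays a numba dispatch plus an np.concatenate copy, which is why 1000 parts ran slower rather than faster. numba can instead parallelise the row loop itself with parallel=True and prange. A minimal sketch, assuming the (variants, samples, 2) array layout that allel returns for calldata/GT; create_mat_parallel is a renamed variant with the same 2/1/0/9 encoding as the question's create_mat:

import numba as nb
import numpy as np

@nb.njit(parallel=True)
def create_mat_parallel(geno):
    n_variants = geno.shape[0]
    n_samples = geno.shape[1]
    # Default to 9 (missing) so only the three known genotypes need writes.
    geno_mat = np.full((n_variants, n_samples), 9, dtype=np.uint8)
    for i in nb.prange(n_variants):  # rows are independent, so numba can
        for k in range(n_samples):   # spread this loop across all cores
            a = geno[i, k, 0]
            b = geno[i, k, 1]
            if a == 0 and b == 0:
                geno_mat[i, k] = 2
            elif (a == 0 and b == 1) or (a == 1 and b == 0):
                geno_mat[i, k] = 1
            elif a == 1 and b == 1:
                geno_mat[i, k] = 0
    return geno_mat

The thread count can be capped with nb.set_num_threads(); the first call still includes the one-off compilation cost, so benchmark the second call.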

0 Answers