The PRS obtained from using Plink on the micro-array sequenced data was converted to percentiles based on our training data, the UK Biobank. As noticed before, the scores fall in a Gaussian distribution and this further helps us understand that most people are going to be at medium risk while only a small portion of people would have a high or low risk of contracting the disease.
This data was appended to a CSV file called GastroATL.csv that showed the sample date, name, PRS, and percentile.
prs_interpretation.py
#!/usr/bin/env python3import sysimport osimport csvimport matplotlib.pyplot as pltimport numpy as npfrom scipy.stats import percentileofscore# Function to read polygenic risk scores from a tab-delimited file, skipping headerdefread_polygenic_scores(file_path,score_column): scores = []withopen(file_path, 'r')as file:# Skip the header linenext(file)for line in file: columns = line.strip().split('\t')iflen(columns)> score_column: score =float(columns[score_column]) scores.append(score)return scores# Function to create a distribution plot of polygenic risk scoresdefplot_polygenic_scores_distribution(scores): plt.hist(scores, bins=20, edgecolor='black', alpha=0.7) plt.title('Polygenic Risk Scores Distribution') plt.xlabel('Polygenic Risk Scores') plt.ylabel('Frequency')#plt.savefig('polygenic_scores_distribution.png') # Save the plot as an image#plt.show()# Function to find the percentile of a given score in the distributiondeffind_percentile(score,distribution):returnpercentileofscore(distribution, score)# Function to append data to the CSV filedefappend_to_csv(file_path,data):withopen(file_path, 'a', newline='')as csv_file: csv_writer = csv.writer(csv_file) csv_writer.writerow(data)defmain():# Assign file paths ukb_file_path ='/home/vsrinivasan75/ukb_prs/using_plink/PGS000785/ukb_chr1-22_v1_nomean_PGS000785.sscore' tempus_file_path = sys.argv[1]iflen(sys.argv)==2elseNone# Check if the second file path is providedif tempus_file_path isNone:print("Usage: python script.py tempus_file_path") sys.exit(1)# Read polygenic risk scores from the UK Biobank file (5th column) polygenic_scores_ukb =read_polygenic_scores(ukb_file_path, score_column=4)# Plot the distribution of polygenic risk scores for the UK Biobank fileplot_polygenic_scores_distribution(polygenic_scores_ukb)# Read polygenic risk scores from the Tempus file (4th column) polygenic_scores_tempus =read_polygenic_scores(tempus_file_path, score_column=3)# Example: Find the percentile of the first score from the Tempus fileif polygenic_scores_tempus:# Check if the list is not empty specific_score = polygenic_scores_tempus[0] percentile =find_percentile(specific_score, polygenic_scores_ukb) rounded_percentile =round(percentile, 2)print(f'The percentile of the score {specific_score} in the UKB distribution is: {rounded_percentile}%')# Extract required information from tempus_file_path tempus_file_parts = tempus_file_path.split('/') date = tempus_file_parts[-2].split('_')[0] # Extract the part before the underscoreid= tempus_file_parts[-1].split('.')[0] # Extract until the first occurrence of "."# Check for duplicates before appending to the CSV file data_to_append = [date,id, specific_score, rounded_percentile] csv_file_path ='GastroATL_PRS.csv'withopen(csv_file_path, 'r')as csv_file: csv_reader = csv.reader(csv_file)for row in csv_reader:if row[:3]== data_to_append[:3]:# Check if the first three columns matchprint("Duplicate entry. Skipping...") sys.exit(0)# Append the data to the CSV fileappend_to_csv(csv_file_path, data_to_append)print("Data appended to GastroATL_PRS.csv")else:print("The Tempus file does not contain any polygenic risk scores.")if__name__=="__main__":main()
The IDs have not been shown in the following results to maintain confidentiality.