#Given four lists of data, ask if the ratios between pairs of datasets are significantly different.
#Compare the ratio of A/B to the ratio of C/D
#python3

import argparse
from math import log2

import numpy as np
import pandas as pd
from scipy.stats import ranksums

# Plotting is optional: the only uses of these libraries are the commented-out
# KDE plots in main(), so the statistics should still run without them.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:
    plt = None
    sns = None

# Replicate suffixes expected in the Excel sheet (columns A1..A3, B1..B3, ...).
_REPLICATES = (1, 2, 3)


def _read_floats(path):
    """Return the non-blank lines of the text file at *path* parsed as floats."""
    with open(path, 'r') as infh:
        stripped = (line.strip() for line in infh)
        return [float(line) for line in stripped if line]


#A and B are paired. C and D are paired.
def readdata(fileA, fileB, fileC, fileD):
    """Read four one-value-per-line text files.

    Blank lines are ignored; every other line must parse as a float.

    Returns:
        A tuple ``(dataA, dataB, dataC, dataD)`` of lists of floats.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if a non-blank line is not a valid float.
    """
    return (
        _read_floats(fileA),
        _read_floats(fileB),
        _read_floats(fileC),
        _read_floats(fileD),
    )


def _pool_replicates(df, prefix):
    """Concatenate columns ``<prefix>1..<prefix>3`` of *df*, dropping missing cells."""
    pooled = []
    for rep in _REPLICATES:
        pooled.extend(df['{0}{1}'.format(prefix, rep)].tolist())
    # Columns of unequal length are padded with NaN by pandas; drop the padding.
    return [x for x in pooled if pd.notna(x)]


def readdata_xls(xlsfile):
    """Read the four datasets from an Excel sheet.

    The first row is used as the header and the sheet must contain the
    columns A1-A3, B1-B3, C1-C3 and D1-D3. The three replicate columns of
    each dataset are pooled into a single list and missing cells are dropped.

    Returns:
        A tuple ``(a, b, c, d)`` of lists of values.

    Raises:
        KeyError: if one of the expected columns is absent.
    """
    df = pd.read_excel(xlsfile, header=0, index_col=None)
    return tuple(_pool_replicates(df, prefix) for prefix in 'ABCD')


def getallratios(dataX, dataY):
    """Return log2(y / x) for every y in *dataY* against every x in *dataX*.

    Pairs whose ratio has no finite log2 are skipped: a zero denominator
    (ZeroDivisionError) and a zero or negative ratio (ValueError from log2).
    Results are ordered with *dataY* as the outer loop.
    """
    ratios = []
    for y in dataY:
        for x in dataX:
            try:
                ratios.append(log2(y / x))
            except (ZeroDivisionError, ValueError):
                # Undefined ratio or log; leave this pair out of the distribution.
                continue
    return ratios


def compareratios(ratios1, ratios2):
    """Print the median of each ratio set and the rank-sum p-value between them.

    Uses the two-sided Wilcoxon rank-sum test from scipy (``ranksums``).
    """
    p = ranksums(ratios1, ratios2)[1]
    ratios1med = np.median(ratios1)
    ratios2med = np.median(ratios2)
    print('The median log2 ratio of A/B is {0}.'.format(round(ratios1med, 3)))
    print('The median log2 ratio of C/D is {0}.'.format(round(ratios2med, 3)))
    print('The pvalue that distribution of the two sets of ratios are different is {0}.'.format(p))


def cliffsDelta(lst1, lst2):
    """Returns cliff's delta for effect size of difference between two lists

    The value is (#pairs with x > y  -  #pairs with x < y) / (len(lst1) * len(lst2))
    for x in *lst1* and y in *lst2*, so it lies in [-1, 1].

    Raises:
        ZeroDivisionError: if either list is empty.
    """
    #https://github.com/neilernst/cliffsDelta/blob/master/cliffsDelta.py
    m, n = len(lst1), len(lst2)
    lst2 = sorted(lst2)
    j = more = less = 0
    # Walk both sorted lists once; j only ever moves forward through lst2.
    for repeats, x in runs(sorted(lst1)):
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        # j elements of lst2 are strictly smaller than x.
        more += j * repeats
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        # Everything from j onwards is strictly larger than x.
        less += (n - j) * repeats
    d = (more - less) / (m * n)
    return d


def runs(lst):
    """Iterator, chunks repeated values

    Yields ``(count, value)`` for each run of consecutive equal values in
    *lst*. An empty *lst* yields nothing.
    """
    if not lst:
        # Without this guard the final yield would reference unbound names.
        return
    for j, two in enumerate(lst):
        if j == 0:
            one, i = two, 0
        if one != two:
            yield j - i, one
            i = j
        one = two
    yield j - i + 1, two


def main():
    """Compare the A/B and C/D log2 ratio distributions from an Excel file."""
    parser = argparse.ArgumentParser(
        description='Test whether the ratio A/B differs from the ratio C/D.')
    parser.add_argument('--excelfile', type=str, required=True,
                        help='Excel file with columns A1-A3, B1-B3, C1-C3, D1-D3.')
    args = parser.parse_args()

    dataA, dataB, dataC, dataD = readdata_xls(args.excelfile)
    ratios_ab = getallratios(dataB, dataA)
    ratios_cd = getallratios(dataD, dataC)

    # Long-format table of both ratio sets, kept for the optional plots below.
    d1 = {'value': ratios_ab, 'ratio': ['ab'] * len(ratios_ab)}
    d2 = {'value': ratios_cd, 'ratio': ['cd'] * len(ratios_cd)}
    df1 = pd.DataFrame.from_dict(data=d1)
    df2 = pd.DataFrame.from_dict(data=d2)
    # DataFrame.append was removed in pandas 2.0; concat is the supported form.
    df = pd.concat([df1, df2], ignore_index=True)

    #ax = sns.kdeplot(ratios_ab)
    #ax = sns.kdeplot(ratios_cd)
    compareratios(ratios_ab, ratios_cd)
    delta = round(cliffsDelta(ratios_ab, ratios_cd), 2)
    print('Cliff\'s delta = {0}'.format(delta))
    #plt.show()


if __name__ == '__main__':
    main()