# script t-test.py ''' Student's t-test for two samples p-value from t-distribution or by resampling of orginal data (bootstap method) .. author:: Guenter Quast fuer den Kurs Rechnernnutzung in der Physik (RNP) ''' # ------------------------------------------------------------- # pyhton2 - python3 compatibility from __future__ import division from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals import numpy as np import matplotlib.pyplot as plt import scipy.stats as stat def calc_Student_t(d1,d2): ''' Calculate test statistic t for Student's t-test input: d1, d2: 1d numpy-arrays containing the data Args: d1, d2: 1d numpy-arrays containing the data Returns: float: value of Student's test float: number of degrees of freedom ''' dm=d2.mean()-d1.mean() n1=len(d1) n2=len(d2) ndf=np.float32(n1+n2-2) s12=np.sqrt( ( (n1-1)*d1.var(ddof=1) + (n2-1)*d2.var(ddof=1) ) / ndf ) t=dm/(s12*np.sqrt(1./n1 + 1./n2) ) return t, ndf # ------------------------------------------------------------- ##### ---- main Program starts here ----- if __name__ == "__main__": ''' illustrate application of calt_t ''' def getData(): # provide data samples d1=np.array([\ 2.7, 2.7, 2.7, 2.8, 2.5, 1.1, 2.9, 2.5, 1.5, 1.0, 2.8, 1.0, 2.2, 2.7,\ 2.4, 3.1, 2.6, 1.3, 1.7, 3.1, 1.0, 2.4, 1.9, 1.7, 2.8, 2.1, 2.6, 2.4,\ 2.5, 2.2, 2.3, 2.0, 2.3, 2.1, 2.1, 3.5, 1.8, 2.0, 2.6, 2.9, 3.1, 1.7,\ 2.7, 2.7, 1.2, 2.6, 2.7, 1.2, 3.1, 1.7, 2.9, 2.9, 2.2, 1.5, 1.9, 2.4,\ 1.7, 2.7, 2.7, 1.7, 2.6, 1.8, 2.1, 1.9, 2.4, 3.0, 3.8, 2.9, 3.6, 1.2,\ 2.5, 2.8, 2.7, 1.7, 2.6, 2.7, 1.0, 3.4, 3.0, 1.3, 1.9, 2.8, 3.1, 1.2,\ 2.3, 2.1, 2.8, 2.5, 2.1, 2.1, 3.4, 2.1, 1.9, 3.3, 1.2, 2.8, 2.1, 3.0,\ 1.5, 3.2, 1.6, 1.9, 2.8, 3.7, 2.9, 2.6, 2.4, 2.9, 1.4, 1.8 ], dtype=np.float32) d2=np.array([\ 3.2, 3.1, 2.7, 1.8, 1.9, 2.1, 3.3, 1.8, 2.0, 2.8, 3.0, 3.4, 1.1, 2.5,\ 2.1, 3.1, 2.6, 2.1], dtype=np.float32) return d1,d2 # d1, d2 = getData() # # calculate Student's t t, ndf=calc_Student_t(d1,d2) # determine p-value from t distribution # p-value: test on equality d1_mean = d2_mean pval = 2* stat.t.cdf(-np.abs(t), ndf) # two-sided limit # p-value: test on d1_mean !> d2_mean pval1 = stat.t.cdf(t, ndf) # one-sided limit, from left # p-value: test on d2_mean !> d1_mean pval2 = 1. - stat.t.cdf(t, ndf) # one-sided limit, from right # # p-value using Bootstrap method, i.e. resampling form data itself, d = np.concatenate( (d1, d2) ) # use as population distribution niter=10000 # draw samples n1=len(d1) n2=len(d2) g = [] for i in range(niter): ti, _ = calc_Student_t(np.random.choice(d, size=n1), np.random.choice(d, size=n2) ) g.append(abs(ti)) g = np.array(g) pval_bs = float(np.sum(g > abs(t))) / niter # count smples t_i > t_obs # print result print( '*==* t=%.3f for %i degrees of freedom'%(t,ndf) ) print( '*==* p-val equality %.3f'%(pval) ) print( '*==* pval mean_1 !> mean_2: %.3f'%(pval1) ) print( '*==* pval mean_2 !> mean_1: %.3f'%(pval2) ) print( '' ) print( '*==* pval from bootstrap: %.3f'%(pval_bs) ) # show input data and p-valu(es) fig=plt.figure('t-test', figsize=(10.,10.)) fig.subplots_adjust(left=0.14, bottom=0.1, right=0.97, top=0.93, wspace=None, hspace=.25)# ax1=fig.add_subplot(2, 2, 1) ax2=fig.add_subplot(2, 2, 2) ax3=fig.add_subplot(2, 2, 4) # histogram data nbins=50 bins=np.linspace( np.amin([np.amin(d1),np.amin(d2)]), np.amax([np.amax(d1),np.amax(d2)]), 30) ax1.hist(d1,bins, alpha=0.5, label='sample 1') ax1.hist(d2,bins, alpha=0.5, label='sample 2') ax1.set_ylabel('freqeuncy') ax1.set_xlabel('mark') ax1.legend(loc='best') # plot t-distribution and t-value for test on equality tvals=np.linspace(0., 4.5, 100) ax2.plot(tvals, stat.t.pdf(tvals, ndf), lw=2, color='steelblue', label='t-dist(n$_f\,=\,$%i)'%ndf) ax2.axvline(x=t, color='g', lw=1, linestyle='--', label='t observed in data' ) ax2.arrow(t, 0.15, 0., -0.13, head_length=0.01, head_width=0.1, lw=2, ec='r', fc='r') ax2.set_title('t-distribution') ax2.set_xlabel('|t|') ax2.text(0.5, 0.75, 'p-value = %.1f%%'%(pval*100), transform=ax2.transAxes ) ax2.legend(loc='best') # plot resampled distribution bc, be, _ = ax3.hist(g, bins=100, alpha=0.5, color='blue', density=True, label='t from resampled distribution' ) ax3.set_title('bootstrap method') ax3.set_xlabel('|t|') ax3.axvline(x=t, color='g', lw=1, linestyle='--', label='t observed in data' ) ax3.arrow(t, 0.3, 0, -0.25, head_length=0.02, head_width=0.1, lw=2, ec='r', fc='r') ax3.text(0.5, 0.75, 'p-value = %.1f%%'%(pval_bs*100), transform=ax3.transAxes ) ax3.plot(tvals, 2. * stat.t.pdf(tvals, ndf), lw=2, color='steelblue', label='t-dist(n$_f \,=\,$%i)'%ndf) ax3.legend(loc='best') plt.show()