# script t-test.py
''' 
   Student's t-test for two samples

   p-value from t-distribution 
   or by resampling of orginal data (bootstap method)

.. author:: Guenter Quast <g.quast@kit.edu> 
     fuer den Kurs Rechnernnutzung in der Physik (RNP)
'''
# -------------------------------------------------------------

# pyhton2 - python3 compatibility
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stat

def calc_Student_t(d1,d2):
  '''
    Calculate test statistic t for Student's t-test
    input: d1, d2: 1d numpy-arrays containing the data
    Args: 
     d1, d2: 1d numpy-arrays containing the data
   
    Returns:
     float: value of Student's test
     float: number of degrees of freedom
  '''
  dm=d2.mean()-d1.mean()
  n1=len(d1)
  n2=len(d2)
  ndf=np.float32(n1+n2-2)
  s12=np.sqrt( ( (n1-1)*d1.var(ddof=1) + (n2-1)*d2.var(ddof=1) ) / ndf ) 
  t=dm/(s12*np.sqrt(1./n1 + 1./n2) )
  return t, ndf

# -------------------------------------------------------------

##### ---- main Program starts here -----
if __name__ == "__main__":
  '''
    illustrate application of calt_t
  '''

  def getData():
  # provide data samples
    d1=np.array([\
     2.7, 2.7, 2.7, 2.8, 2.5, 1.1, 2.9, 2.5, 1.5, 1.0, 2.8, 1.0, 2.2, 2.7,\
     2.4, 3.1, 2.6, 1.3, 1.7, 3.1, 1.0, 2.4, 1.9, 1.7, 2.8, 2.1, 2.6, 2.4,\
     2.5, 2.2, 2.3, 2.0, 2.3, 2.1, 2.1, 3.5, 1.8, 2.0, 2.6, 2.9, 3.1, 1.7,\
     2.7, 2.7, 1.2, 2.6, 2.7, 1.2, 3.1, 1.7, 2.9, 2.9, 2.2, 1.5, 1.9, 2.4,\
     1.7, 2.7, 2.7, 1.7, 2.6, 1.8, 2.1, 1.9, 2.4, 3.0, 3.8, 2.9, 3.6, 1.2,\
     2.5, 2.8, 2.7, 1.7, 2.6, 2.7, 1.0, 3.4, 3.0, 1.3, 1.9, 2.8, 3.1, 1.2,\
     2.3, 2.1, 2.8, 2.5, 2.1, 2.1, 3.4, 2.1, 1.9, 3.3, 1.2, 2.8, 2.1, 3.0,\
     1.5, 3.2, 1.6, 1.9, 2.8, 3.7, 2.9, 2.6, 2.4, 2.9, 1.4, 1.8 ],
     dtype=np.float32)
    d2=np.array([\
     3.2, 3.1, 2.7, 1.8, 1.9, 2.1, 3.3, 1.8, 2.0, 2.8, 3.0, 3.4, 1.1, 2.5,\
     2.1, 3.1, 2.6, 2.1],
     dtype=np.float32)
    return d1,d2  
#
  d1, d2 = getData()
#  
# calculate Student's t
  t, ndf=calc_Student_t(d1,d2)

# determine p-value from t distribution
#   p-value: test on equality d1_mean = d2_mean
  pval = 2* stat.t.cdf(-np.abs(t), ndf) # two-sided limit
#   p-value: test on d1_mean !> d2_mean
  pval1 = stat.t.cdf(t, ndf) # one-sided limit, from left
#   p-value: test on d2_mean !> d1_mean
  pval2 = 1. - stat.t.cdf(t, ndf) # one-sided limit, from right

# 
# p-value using Bootstrap method, i.e. resampling form data itself,
  d = np.concatenate( (d1, d2) ) # use as population distribution
  niter=10000   # draw <niter> samples
  n1=len(d1)
  n2=len(d2)
  g =  []
  for i in range(niter):
    ti, _ = calc_Student_t(np.random.choice(d, size=n1), 
                           np.random.choice(d, size=n2) )
    g.append(abs(ti))
  g = np.array(g)
  pval_bs = float(np.sum(g > abs(t))) / niter  # count smples t_i > t_obs

# print result 
  print( '*==* t=%.3f for %i degrees of freedom'%(t,ndf) )
  print( '*==* p-val equality %.3f'%(pval) )
  print( '*==* pval mean_1 !> mean_2: %.3f'%(pval1) )
  print( '*==* pval mean_2 !> mean_1: %.3f'%(pval2) )
  print( '' )
  print( '*==* pval from bootstrap: %.3f'%(pval_bs) )


# show input data and p-valu(es)
  fig=plt.figure('t-test', figsize=(10.,10.))
  fig.subplots_adjust(left=0.14, bottom=0.1, right=0.97, top=0.93,
                    wspace=None, hspace=.25)#

  ax1=fig.add_subplot(2, 2, 1)
  ax2=fig.add_subplot(2, 2, 2)
  ax3=fig.add_subplot(2, 2, 4)
# histogram data
  nbins=50
  bins=np.linspace( np.amin([np.amin(d1),np.amin(d2)]),
                    np.amax([np.amax(d1),np.amax(d2)]), 30)
  ax1.hist(d1,bins, alpha=0.5, label='sample 1')
  ax1.hist(d2,bins, alpha=0.5, label='sample 2')
  ax1.set_ylabel('freqeuncy')
  ax1.set_xlabel('mark')
  ax1.legend(loc='best')
# plot t-distribution and t-value for test on equality
  tvals=np.linspace(0., 4.5, 100)
  ax2.plot(tvals, stat.t.pdf(tvals, ndf), 
           lw=2, color='steelblue', label='t-dist(n$_f\,=\,$%i)'%ndf)
  ax2.axvline(x=t, color='g', lw=1, linestyle='--', label='t observed in data' )
  ax2.arrow(t, 0.15, 0., -0.13, head_length=0.01, head_width=0.1, 
            lw=2, ec='r', fc='r')
  ax2.set_title('t-distribution')
  ax2.set_xlabel('|t|')
  ax2.text(0.5, 0.75,  'p-value =  %.1f%%'%(pval*100),
     transform=ax2.transAxes )
  ax2.legend(loc='best')

# plot resampled distribution
  bc, be, _ = ax3.hist(g, bins=100, alpha=0.5, color='blue', density=True,
                label='t from resampled distribution' )
  ax3.set_title('bootstrap method')
  ax3.set_xlabel('|t|')
  ax3.axvline(x=t, color='g', lw=1, linestyle='--', label='t observed in data' )
  ax3.arrow(t, 0.3, 0,  -0.25, head_length=0.02, head_width=0.1, 
            lw=2, ec='r', fc='r')
  ax3.text(0.5, 0.75,  'p-value =  %.1f%%'%(pval_bs*100),
     transform=ax3.transAxes )
  ax3.plot(tvals, 2. * stat.t.pdf(tvals, ndf), 
           lw=2, color='steelblue', label='t-dist(n$_f \,=\,$%i)'%ndf)
  ax3.legend(loc='best')
  plt.show()