import sys
import os
import requests
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
#import pandasql as ps
import sqlite3
#from pandasql import ps
from sklearn.decomposition import PCA #, ProbabilisticPCA
from sklearn.impute import KNNImputer, SimpleImputer
import plotly
import plotly.express as px
#from plotly.offline import init_notebook_mode
#init_notebook_mode(connected = True)

#import seaborn as sns
#import vaex as vx 
import matplotlib.colors as pltcolors
import copy
import subprocess

import urllib.request
#urllib.request.urlretrieve(api_url, outfile)

import csv
from io import StringIO


import pandas as pd
from qqman import qqman
#wand.display.display(image, server_name=':0')
import math
import hashlib
#from Exceptions import AttributeError, TypeError

try:
	from IPython.display import Image, FileLink, FileLinks
	from wand.image import Image as WImage
	from wand.display import display as wdisplay
except Exception as ex:
	print(str(ex))
	pass

#px = 1/plt.rcParams['figure.dpi']


# 'datatype' tags used in the data rows; the numeric prefix fixes sort order
TYPE_ID='1 ID'
TYPE_PROP='2 PROP'
TYPE_EXP='4 EXP'
TYPE_PHEN='3 PHEN'
TYPE_SNP='5 SNP'
# message levels passed to printdisplay/pltshow (compared against LOG_LEVEL)
MESSAGE_INFO=3	# show always
MESSAGE_DEBUG=2
MESSAGE_ERROR=1	# show only error

# in  assigning level for message
#MESSAGE_INFO  always show
#MESSAGE_DEBUG show if MESSAGE_DEBUG
#MESSAGE_ERROR always show


# name -> value lookup for settings (see getSetting0 / setVariables0)
var2val=dict()
var2val['SHOW_ALL']=3
var2val['SHOW_DEBUG']=2
var2val['SHOW_ERRORONLY']=1
var2val['TYPE_EXP']=TYPE_EXP
var2val['TYPE_PHEN']=TYPE_PHEN
var2val['TYPE_PROP']=TYPE_PROP
var2val['TYPE_ID']=TYPE_ID

# files offered as download links in notebook output (see printdisplay/pltshow)
downloadfiles=set()

#settings
'''
keep_unit=None
plink_path=None
r_path=None
LOG_LEVEL=None
'''

# (from_unit, to_unit) -> multiplicative conversion factor
unit_converter=dict()
unit_converter[('percent_of_dw','ug_per_gdw')]=10000.0
unit_converter[('ug_per_gdw','percent_of_dw')]=1.0/10000.0
plink_path="plink"
r_path="Rscript"
LOG_LEVEL=var2val['SHOW_ALL']
keep_unit=None #'percent_of_dw'

# for reload -- these override the defaults assigned just above
keep_unit='percent_of_dw'
LOG_LEVEL=var2val['SHOW_ERRORONLY']
# default image sizes in pixels; *_in is in inches (matplotlib figsize)
imgsize_large=(800,800)
imgsize_xlarge=(2000,2000)
imgsize_xlarge_in=(20,20)
imgsize_small=(400,400)
#LOG_LEVEL=var2val['SHOW_DEBUG']



#LOG_LEVEL=var2val['SHOW_ERRORONLY']
#unit_converter=None


def is_notebook() -> bool:
	"""Return True when running inside a Jupyter notebook / qtconsole kernel."""
	try:
		kernel_class = get_ipython().__class__.__name__
	except NameError:
		# plain CPython interpreter: get_ipython is not defined
		return False
	# ZMQ-based shells are notebook/qtconsole; everything else
	# (terminal IPython included) counts as non-notebook.
	return kernel_class == 'ZMQInteractiveShell'

# BUG FIX: the original wrote "if is_notebook:" which tests the function
# object itself (always truthy); it must be *called* to detect the shell.
if is_notebook():
	from IPython.display import display, Image

def init_settings():
	"""Reset the module-level settings to their defaults.

	Restores LOG_LEVEL, keep_unit, unit_converter, plink_path and r_path.
	"""
	# BUG FIX: plink_path and r_path were assigned without being declared
	# global, so resetting them silently had no effect on the module state.
	global keep_unit, LOG_LEVEL, unit_converter, plink_path, r_path
	LOG_LEVEL=var2val['SHOW_ERRORONLY']
	keep_unit=None
	unit_converter=dict()
	unit_converter[('percent_of_dw','ug_per_gdw')]=10000.0
	unit_converter[('ug_per_gdw','percent_of_dw')]=1.0/10000.0
	plink_path="plink"
	r_path="Rscript"

def myhash(mystring):
	"""Return the SHA-512 digest of *mystring* as a decimal integer string."""
	digest_hex = hashlib.sha512(mystring.encode('utf-8')).hexdigest()
	return str(int(digest_hex, 16))


def ossystem(cmd):
	"""Run a shell command, routing rm/mv/cp through the guarded helpers.

	- Commands containing output redirection run as-is.
	- "rm "/"mv "/"cp " commands are delegated to osrm/osmv/oscp.
	- In quiet mode, plink commands get --noweb appended and stdout is
	  redirected to /dev/null; in debug mode the command is displayed.
	Returns the os.system exit status when a command is executed directly.
	"""
	if ">" in cmd:
		printdisplay(MESSAGE_DEBUG," output rediretion in command: " + cmd)
		display(cmd)
		return os.system(cmd)
	# BUG FIX: strip only the leading "rm "/"mv "/"cp " token; the old
	# str.replace() form also mangled any later occurrence inside paths
	# (e.g. a file name containing "cp ").
	if cmd.startswith("rm "):
		return osrm(cmd[3:])
	if cmd.startswith("mv "):
		return osmv(cmd[3:])
	if cmd.startswith("cp "):
		return oscp(cmd[3:])
	if LOG_LEVEL>2 or LOG_LEVEL==var2val['SHOW_DEBUG']:
		display(cmd)
		return os.system(cmd)
	else:
		if 'plink' in cmd.split()[0]:
			cmd=cmd+ ' --noweb'
		printdisplay(MESSAGE_DEBUG,cmd + " > /dev/null")
		return os.system(cmd + " > /dev/null")

def osrm(rmfile):
	"""Remove a file (or glob pattern) via the shell, quietly unless debugging."""
	if not os.path.exists(rmfile):
		# a glob pattern never "exists" literally; still hand it to the shell
		if "*" in rmfile and len(rmfile.strip()) > 1:
			os.system("rm " + rmfile)
		return
	debugging = LOG_LEVEL > 2 or LOG_LEVEL == var2val['SHOW_DEBUG']
	if debugging:
		os.system("rm " + rmfile)
	else:
		os.system("rm " + rmfile + " &> /dev/null")
	
def osmv(mvfile):
	"""Move a file via the shell; silently skip when the source is missing."""
	source = mvfile.split()[0]
	if not os.path.exists(source):
		return
	debugging = LOG_LEVEL > 2 or LOG_LEVEL == var2val['SHOW_DEBUG']
	if debugging:
		os.system("mv " + mvfile)
	else:
		os.system("mv " + mvfile + " &> /dev/null")

def oscp(mvfile):
	"""Copy a file via the shell; silently skip when the source is missing."""
	source = mvfile.split()[0]
	if not os.path.exists(source):
		return
	debugging = LOG_LEVEL > 2 or LOG_LEVEL == var2val['SHOW_DEBUG']
	if debugging:
		os.system("cp " + mvfile)
	else:
		os.system("cp " + mvfile + " &> /dev/null")



def pltshow(show_level,myplot,name=''):
	"""Display a plot, honouring the module LOG_LEVEL.

	myplot may be a path to an image file (shown inline with a download
	link) or a plot object exposing .show().  name, when given, is
	printed as a caption first.  show_level is one of MESSAGE_INFO /
	MESSAGE_DEBUG / MESSAGE_ERROR.
	"""
	if isinstance(myplot, str):
		#printdisplay(show_level, WImage(filename=myplot))
		# offer the image file itself as a notebook download link
		flink=FileLink(myplot, result_html_prefix="Click here to download: ")
		display(flink)
		if os.path.exists(myplot):
			printdisplay(show_level, Image(filename=myplot, width=imgsize_large[0], height=imgsize_large[1]),name)
			downloadfiles.add(myplot)
		else:
			printdisplay(MESSAGE_INFO,'file dont exists ' + myplot)
	elif LOG_LEVEL>2 or (show_level==MESSAGE_DEBUG and LOG_LEVEL==var2val['SHOW_DEBUG']) or MESSAGE_INFO==show_level:
		if name:
			print(name)
		
		#try:
			#plt.figure(figsize=(imgsize_large[0] , imgsize_large[1]))
		#except:
		#	pass
		myplot.show()


def printdisplay(show_level, obj,name=''):
	"""Log/display *obj* when LOG_LEVEL permits.

	show_level is one of MESSAGE_INFO/MESSAGE_DEBUG/MESSAGE_ERROR.  In a
	notebook, a string that names an existing file gets a download link
	and is previewed as a TSV dataframe; "generated <file>" / "loaded
	<file>" messages also get a download link for the named file.  name,
	when given, is printed first as a caption.
	"""
	if  LOG_LEVEL>1 or  (show_level==MESSAGE_DEBUG and LOG_LEVEL==var2val['SHOW_DEBUG']) or MESSAGE_INFO==show_level:
		if name:
			print(name)
		if is_notebook():
			if isinstance(obj,str) and os.path.exists(str(obj)):
				# message is a path: link it and preview it as a TSV table
				flink=FileLink(str(obj), result_html_prefix="Click here to download: ")
				display(flink)
				downloadfiles.add(obj)
				obj=pd.read_csv(str(obj),sep="\t")
				display(obj)
			elif isinstance(obj,str):
				strfile=str(obj).split()[0].strip()
				if strfile.lower()=='generated' or strfile.lower()=='loaded':
					# "generated <path>" / "loaded <path>": link the named file
					strfile=str(obj).split()[1].strip()
					if os.path.exists(strfile):
						flink=FileLink(strfile, result_html_prefix="Click here to download: ")
						display(flink)
						downloadfiles.add(strfile)
					else:
						display('File ' + strfile + ' dont exists')
				display(str(obj))
			elif isinstance(obj,WImage):
				wdisplay(obj)
			elif isinstance(obj,Image):
				display(obj)
			else:
				if isinstance(obj, pd.DataFrame):
					# show the full column list first: wide frames get truncated
					print(obj.columns.values.tolist())
				display(obj)
				#print(obj)
		else:
			#print(obj)
			if isinstance(obj, pd.DataFrame):
				print(obj.columns.values.tolist())
			print(obj)

def getSetting0(varname):
	"""Return the settings constant registered under *varname* (raises KeyError if unknown)."""
	return var2val[varname]


def setVariables0(my_keep_unit=None, my_plink_path=None,my_loglevel=None,my_unit_converter=None,my_r_path=None, my_imgsize_large=None, my_imgsize_xlarge=None, my_imgsize_xlarge_in=None, my_imgsize_small=None):
	"""Override the module-level settings; only truthy arguments take effect.

	my_unit_converter maps (from_unit, to_unit) -> factor; the inverse
	direction is registered automatically as 1/factor.  The resulting
	configuration is reported at MESSAGE_INFO level.
	"""
	global keep_unit, plink_path, LOG_LEVEL, unit_converter, r_path, imgsize_large, imgsize_xlarge, imgsize_xlarge_in, imgsize_small
	if my_keep_unit:
		keep_unit=my_keep_unit
	if my_plink_path:
		plink_path=my_plink_path
	if my_r_path:
		r_path=my_r_path
	if my_loglevel:
		LOG_LEVEL=my_loglevel
	if my_imgsize_large:
		imgsize_large=my_imgsize_large
	if my_imgsize_xlarge:
		imgsize_xlarge=my_imgsize_xlarge
	if my_imgsize_xlarge_in:
		imgsize_xlarge_in=my_imgsize_xlarge_in
	if my_imgsize_small:
		imgsize_small=my_imgsize_small


	if my_unit_converter:
		# register both directions of every supplied conversion
		for i in my_unit_converter:
			unit_converter[(i[0],i[1])]=my_unit_converter[i]
			unit_converter[(i[1],i[0])]=1.0/my_unit_converter[i]

	printdisplay(MESSAGE_INFO,"set variables: \nkeep_unit=" + str(keep_unit) + "\nr_path=" + str(r_path) + "\nplink_path=" + str(plink_path) + "\nLOG_LEVEL=" + str(LOG_LEVEL) +"\nunit_converter=" + str(unit_converter))

def pdf2png(filename,resolution=100,display=True):
	"""Convert a PDF to PNG via Wand/ImageMagick and optionally display it.

	The PNG is written next to the input with ".pdf" replaced by ".png".
	Failures are logged at MESSAGE_DEBUG level rather than raised.
	"""
	try:
		img = WImage(filename=filename, resolution=resolution) # bigger
		img.format="png"
		img.save(filename=filename.replace(".pdf",".png"))
		if display:
			#printdisplay(MESSAGE_DEBUG,'python -m wand.display ' + filename.replace(".pdf",".png"))
			#ossystem('python -m wand.display ' + filename.replace(".pdf",".png"))
			wdisplay(img)

	except Exception as ex:
		printdisplay(MESSAGE_DEBUG,'ERROR in pdf2png ' +filename)
		printdisplay(MESSAGE_DEBUG,str(ex))

def float_input(x):
	"""Parse *x* as a float; return NaN when it cannot be parsed.

	Accepts anything float() accepts; None and invalid strings yield NaN.
	"""
	try:
		return float(x)
	except (TypeError, ValueError):
		# narrowed from a bare except (which also swallowed
		# KeyboardInterrupt); the dead trailing `return ""` was removed
		#printdisplay(MESSAGE_DEBUG,'invalid ' + str(x))
		return float('nan')


def floatarray_input(x):
	"""Parse a string like "[1,2,3]" and return the mean of its numbers.

	A single-element list returns that element unchanged.  On any parse
	failure the original value is returned untouched.

	NOTE(review): this name is redefined later in the file, so the later
	definition is the one in effect after import.
	"""
	try:
		values = list(map(float, x.replace("[","").replace("]","").replace('"',"").split(",")))
	except (AttributeError, TypeError, ValueError):
		# narrowed from a bare except; non-string input or non-numeric
		# content passes through untouched (the dead `return ""` is gone)
		#printdisplay(MESSAGE_DEBUG,'invalid ' + str(x))
		return x
	if len(values) == 1:
		return values[0]
	return sum(values) / len(values)



def convert_unit(convert_from, from_value):
	"""Convert *from_value* from *convert_from* into keep_unit.

	Returns a "value*unit" string; NaN passes through unchanged.
	Raises KeyError when no conversion factor is registered.
	"""
	if pd.isna(from_value):
		return from_value
	if convert_from == keep_unit:
		converted = from_value
	else:
		converted = from_value * unit_converter[(convert_from, keep_unit)]
	return str(converted) + '*' + keep_unit


def convert_unit_values(convert_from, from_value):
	"""Numeric counterpart of convert_unit: return only the converted value.

	NaN passes through; same-unit values are returned unchanged.
	"""
	if pd.isna(from_value):
		return from_value
	if convert_from == keep_unit:
		return from_value
	factor = unit_converter[(convert_from, keep_unit)]
	return from_value * factor




def float_mean(strmeanval):
	"""Parse the mean out of a "mean±sd" string; NA values pass through.

	BUG FIX: the original returned the undefined name ``x`` for missing
	values, raising NameError instead of passing the NA through.
	"""
	if pd.isna(strmeanval):
		return strmeanval
	return float(strmeanval.split('±')[0])

'''
def float_meanconvert(strmeanval):
	if pd.isna(strmeanval):
		return x
	valsunits=x.split('*')
	valsunits[0].replace("[","").replace("]","").replace('"',"").split(",")
	return float(strmeanval.split('±')[0])
'''
def floatarray_input(x):
	"""Convert comma-separated "[mean±sd,...]*unit" tokens into one
	"mean*keep_unit" string.

	Each token's bracketed values are averaged via float_mean and
	converted into keep_unit; the converted tokens are then averaged
	again through floatarray_input_values.  NA passes through; on
	failure with a non-NA input the error is logged at MESSAGE_DEBUG
	and re-raised.
	"""
	if pd.isna(x):
		return x
	try:
		convertdmeansunits=[]
		for ivalsunits in x.split(","):
			valsunits=ivalsunits.split('*')
			l=list(map(float_mean,valsunits[0].replace("[","").replace("]","").replace('"',"").split(",")))
			#l=list(map(float_meanconvert,x))

			#print(x)
			#print(valsunits)
			#print(1)
			meanval=l[0]
			if len(l)>1:
				meanval=sum(l)/len(l)

			#print(meanval)

			if len(valsunits)>1:
				# token carries an explicit unit: convert into keep_unit
				convertdmeansunits.append(convert_unit(valsunits[1], meanval))
			else:
				convertdmeansunits.append(str(meanval))

		return str( floatarray_input_values(",".join(convertdmeansunits).replace("]*" + keep_unit + ",[", ",",) )) + "*" + keep_unit
		#return ",".join(convertdmeansunits)

	except Exception as e:
		if pd.notna(x):
			printdisplay(MESSAGE_DEBUG,'floatarray_input invalid ' + str(x))
			printdisplay(MESSAGE_DEBUG,e)
			raise e
		else:
			printdisplay(MESSAGE_DEBUG,'floatarray_input invalid ' + str(x))
		#return float('nan')
		return x
	return x

def floatarray_input_values(x):
	"""Numeric counterpart of floatarray_input: return the overall mean as a float.

	x is comma-separated "[mean±sd,...]*unit" tokens; each token's mean
	is converted with convert_unit_values and the converted means are
	averaged.  NA passes through; other failures are logged at
	MESSAGE_DEBUG and re-raised.
	"""
	if pd.isna(x):
		return x
	try:
		convertdmeans=[]
		for ivalsunits in x.split(","):
			valsunits=ivalsunits.split('*')
			l=list(map(float_mean,valsunits[0].replace("[","").replace("]","").replace('"',"").split(",")))
			#l=list(map(float_meanconvert,x))
			meanval=l[0]
			if len(l)>1:
				meanval=sum(l)/len(l)

			if len(valsunits)>1:
				# token carries an explicit unit: convert into keep_unit
				convertdmeans.append(convert_unit_values(valsunits[1], meanval))
			else:
				convertdmeans.append(meanval)
		return sum(convertdmeans)/len(convertdmeans)

	except Exception as e:
		if pd.notna(x):
			printdisplay(MESSAGE_DEBUG,'floatarray_input_values invalid ' + str(x))
			printdisplay(MESSAGE_DEBUG,e)
			raise e
		#return float('nan')
		else:
			printdisplay(MESSAGE_DEBUG,'floatarray_input_values invalid ' + str(x))
		return x
	return x



def floatarray_input2(x):
	"""Parse one "[mean±sd,...]*unit" token into a "mean*keep_unit" string.

	Single-token counterpart of floatarray_input: the bracketed values
	are averaged via float_mean and converted into keep_unit when the
	token carries a unit.  NA passes through; on failure with a non-NA
	input the error is logged at MESSAGE_DEBUG and re-raised.

	Removed leftover debug print() statements (they spammed stdout on
	every call) and the unreachable trailing return.
	"""
	if pd.isna(x):
		return x
	try:
		valsunits=x.split('*')
		l=list(map(float_mean,valsunits[0].replace("[","").replace("]","").replace('"',"").split(",")))

		meanval=l[0]
		if len(l)>1:
			meanval=sum(l)/len(l)

		if len(valsunits)>1:
			# token carries an explicit unit: convert into keep_unit
			return convert_unit(valsunits[1], meanval)
		return str(meanval)

	except Exception as e:
		if pd.notna(x):
			printdisplay(MESSAGE_DEBUG,'floatarray_input invalid ' + str(x))
			printdisplay(MESSAGE_DEBUG,e)
			raise e
		else:
			printdisplay(MESSAGE_DEBUG,'floatarray_input invalid ' + str(x))
		return x

def floatarray_input_values2(x):
	"""Parse one "[mean±sd,...]*unit" token and return its mean as a float.

	Single-token counterpart of floatarray_input_values; the mean is
	converted into keep_unit when the token carries a unit.  NA passes
	through; other failures are logged at MESSAGE_DEBUG and re-raised.
	"""
	if pd.isna(x):
		return x
	try:
		valsunits=x.split('*')
		l=list(map(float_mean,valsunits[0].replace("[","").replace("]","").replace('"',"").split(",")))
		#l=list(map(float_meanconvert,x))
		meanval=l[0]
		if len(l)>1:
			meanval=sum(l)/len(l)

		if len(valsunits)>1:
			# token carries an explicit unit: convert into keep_unit
			return convert_unit_values(valsunits[1], meanval)
		return meanval

	except Exception as e:
		if pd.notna(x):
			printdisplay(MESSAGE_DEBUG,'floatarray_input_values invalid ' + str(x))
			printdisplay(MESSAGE_DEBUG,e)
			raise e
		#return float('nan')
		else:
			printdisplay(MESSAGE_DEBUG,'floatarray_input_values invalid ' + str(x))
		return x
	return x

def conv_index_to_bins(index):
	"""Calculate bins to contain the index values.
	The start and end bin boundaries are linearly extrapolated from
	the two first and last values. The middle bin boundaries are
	midpoints.

	Example 1: [0, 1] -> [-0.5, 0.5, 1.5]
	Example 2: [0, 1, 4] -> [-0.5, 0.5, 2.5, 5.5]
	Example 3: [4, 1, 0] -> [5.5, 2.5, 0.5, -0.5]"""
	assert index.is_monotonic_increasing or index.is_monotonic_decreasing

	# the beginning and end values are guessed from first and last two
	start = index[0] - (index[1]-index[0])/2
	end = index[-1] + (index[-1]-index[-2])/2

	# the middle values are the midpoints
	middle = pd.DataFrame({'m1': index[:-1], 'p1': index[1:]})
	middle = middle['m1'] + (middle['p1']-middle['m1'])/2

	# BUG FIX: pd.Float64Index / pd.Int64Index were removed in pandas 2.0;
	# use a plain pd.Index with float64 dtype and a numeric-dtype check.
	if isinstance(index, pd.DatetimeIndex):
		idx = pd.DatetimeIndex(middle).union([start,end])
	elif pd.api.types.is_numeric_dtype(index):
		idx = pd.Index(middle, dtype='float64').union([start,end])
	else:
		printdisplay(MESSAGE_DEBUG,'Warning: guessing what to do with index type %s' % 
			  type(index))
		idx = pd.Index(middle, dtype='float64').union([start,end])

	return idx.sort_values(ascending=index.is_monotonic_increasing)

def calc_df_mesh(df):
	"""Calculate the two-dimensional bins to hold the index and
	column values.

	Returns the (X, Y) meshgrid arrays expected by plt.pcolormesh."""
	return np.meshgrid(conv_index_to_bins(df.index),
					   conv_index_to_bins(df.columns))

def sns_heatmap(df):
	"""Plot a heatmap of the dataframe values using the index and
	columns as the cell-boundary coordinates."""
	X,Y = calc_df_mesh(df)
	c = plt.pcolormesh(X, Y, df.values.T)
	# attach a colorbar for the value scale
	plt.colorbar(c)


def min_nonzero(df):
	"""Return the minimum over strictly positive entries of *df*."""
	positive_only = df[df > 0.0]
	return positive_only.min()


	

def rows_with_val_in_col(val,col,df=None):
	"""Return positional indices of rows where df[col] equals *val*.

	BUG FIX: the original referenced the undefined global ``c`` — a
	NameError unless such a global happened to exist.  The dataframe is
	now an explicit (backward-compatible, optional) parameter; when df
	is None the legacy global ``c`` is still consulted.
	"""
	if df is None:
		df = c  # legacy behaviour: fall back to the module-level dataframe
	return [i for i, x in enumerate(df[col]==val) if x]


def _report_shared(df, prop, message):
	"""Report columns whose *prop* row holds a comma-separated (multi) value.

	Helper for check_shared_samples: finds the row with property == prop
	and logs every column whose cell contains a comma, prefixed with
	*message*, at MESSAGE_INFO level.
	"""
	dfdataset = df[df['property'] == prop]
	if dfdataset.shape[0] == 0:
		return
	colums = dfdataset.columns.values.tolist()
	msg = ""
	for icnt, ds in enumerate(dfdataset.iloc[0, :].tolist()):
		# a comma in the cell means the sample belongs to several values
		if pd.notna(ds) and "," in ds:
			msg += colums[icnt] + " : (" + ds + "); "
	if msg:
		printdisplay(MESSAGE_INFO, message + msg)


def check_shared_samples(df):
	"""Warn about samples whose metadata rows carry multiple comma-separated
	values (shared across datasets, stocks, or phenotype units).

	Refactored: the original repeated the same scan four times inline;
	the logic now lives once in _report_shared.  Messages are unchanged.
	"""
	_report_shared(df, 'phenotype_dataset', "Sample shared by phenotype datasets: ")
	_report_shared(df, 'stock_uniquename', "Sample shared by multiple stocks: ")
	_report_shared(df, 'phenotype_units', "Samples with multiple phenotye units: ")
	_report_shared(df, 'expression_dataset', "Samples shared by expression_dataset datasets: ")

	


def convert_units_to0(df, datatype, to_unit):
	"""Convert every value row of the given datatype into *to_unit*.

	Sets the module-level keep_unit to to_unit, warns about shared
	samples, then returns a pair: (dataframe with "value*unit" string
	cells, dataframe with numeric cells), both converted via
	floatarray_input / floatarray_input_values.
	"""
	global keep_unit
	keep_unit=to_unit

	check_shared_samples(df)

	sample_names=list(df.columns)
	try:
		sample_names.remove('datatype')
	except ValueError:
		pass
	try:
		sample_names.remove('property')
	except ValueError:
		pass

	c_converted_phenunits=df.copy()
	c_converted_phenunits_values=df.copy()
	phenotyperows=c_converted_phenunits['datatype'].str.startswith(datatype)
	try:
		# DataFrame.map exists from pandas 2.1; fall back to the older
		# applymap name when it is unavailable
		c_converted_phenunits.loc[phenotyperows,sample_names]=c_converted_phenunits.loc[phenotyperows,sample_names].map(floatarray_input)
		c_converted_phenunits_values.loc[phenotyperows,sample_names]=c_converted_phenunits.loc[phenotyperows,sample_names].map(floatarray_input_values)
	except Exception as e:
		c_converted_phenunits.loc[phenotyperows,sample_names]=c_converted_phenunits.loc[phenotyperows,sample_names].applymap(floatarray_input)
		c_converted_phenunits_values.loc[phenotyperows,sample_names]=c_converted_phenunits.loc[phenotyperows,sample_names].applymap(floatarray_input_values)


	return  (c_converted_phenunits, c_converted_phenunits_values)


def read_url0(myurl,urlabel,requery=False):
	"""Read a TSV table from *myurl*, caching it as <urlabel>_lastquery.tsv.

	When requery is False and the cache exists, the cache is read instead
	of the URL.  Fresh downloads get their column names stripped, are
	sorted by (datatype, property) when possible, and are written back to
	the cache.  Raises Exception when the table comes back empty.
	"""

	if not requery:

		if not os.path.exists(urlabel + '_lastquery.tsv'):
			requery=True
			printdisplay(MESSAGE_DEBUG,'no cached ' + urlabel + '_lastquery.tsv')
		else:
			printdisplay(MESSAGE_DEBUG,'using ' + urlabel + '_lastquery.tsv')

		if not requery:
			myurl=urlabel + '_lastquery.tsv'

	printdisplay(MESSAGE_DEBUG,'reading ' + myurl)

	index_col=None
	if myurl.endswith('_lastquery.tsv'):
		# cached copies were written with the row index in column 0
		index_col=0

	printdisplay(MESSAGE_DEBUG,'urlabel=' + urlabel + '  myurl=' + myurl)
	c=pd.read_csv(myurl,sep='\t',index_col=index_col)

	if c.shape[0]==0:
		printdisplay(MESSAGE_DEBUG,list(c.columns.values))
		raise Exception(str(list(c.columns.values)))

	if not myurl==urlabel + '_lastquery.tsv':
		# fresh download: normalize column names and refresh the cache
		c.rename(columns = {c: c.strip() for c in c.columns}, inplace = True)
		if 'datatype' in list(c.columns.values):  
			c=c.sort_values(['datatype', 'property'], ascending = [True, True])
		c.to_csv(urlabel + '_lastquery.tsv',  sep="\t", encoding="utf-8")

	return c


def drop_allna_rowcol(df):
	"""Drop rows, then columns, that consist entirely of NaN."""
	return df.dropna(axis=0, how='all').dropna(axis=1, how='all')

def negate_array(x):
	"""Return the element-wise bitwise/logical inversion (~) of *x*."""
	inverted = ~x
	return inverted

def drop_allzero_rowcol(df):
	"""Drop columns, then rows, that consist entirely of 0.0.

	BUG FIX: the original computed the all-zero-column mask but discarded
	the result (the .loc expression was not assigned back), so all-zero
	columns were never actually removed.
	"""
	df=df.loc[:, (df != 0.0).any(axis=0)]
	df=df.loc[~(df==0.0).all(axis=1)]
	return df


def drop_allzero_rowcol1(df):
	"""Integer-comparison variant of drop_allzero_rowcol (compares to 0, not 0.0).

	BUG FIX: as in drop_allzero_rowcol, the column-mask result was
	computed but never assigned back, so all-zero columns survived.
	"""
	df=df.loc[:, (df != 0).any(axis=0)]
	df=df.loc[~(df==0).all(axis=1)]
	return df

def drop_allnazero_rowcol(df):
	"""Drop all-zero rows/columns, then all-NaN rows/columns."""
	return drop_allna_rowcol(drop_allzero_rowcol(df))

def get_clean_samples(df,datatype):
	"""Return rows whose 'datatype' starts with *datatype*, with all-zero/all-NaN rows and columns removed."""
	return drop_allnazero_rowcol(df.loc[df['datatype'].str.startswith(datatype)])

def get_properties_samples(df,datatype):
	"""Return (row index values, column names) of *df* as two lists.

	NOTE(review): the datatype argument is currently unused — no
	filtering happens here; confirm whether callers expect this
	function to filter by datatype before relying on it.
	"""
	samples=list(df.columns.values)
	properties=list(df.index.values)
	return (properties, samples)

def to_csv0(mydf, datatype,outf='outfile',transpose=False,annot=None,annnotsep=","):
		"""Write the rows of one datatype to *outf* as CSV.

		Rows whose 'datatype' starts with *datatype* are indexed by
		'property' (optionally transposed), cleaned of all-zero/all-NaN
		rows and columns, and written to outf.  When *annot* lists
		annotation properties, a companion <outf>.annot.csv is written
		holding those ID/PROP rows for the surviving samples.

		NOTE(review): clean_property (used on 'NCBI BioProject') is not
		defined in this chunk — confirm it exists elsewhere in the module.
		"""
		df = mydf.loc[mydf['datatype'].str.startswith(datatype)]
		df = df.drop(columns=['datatype'])
		df=df.set_index(['property'])
		if transpose:
			df=df.transpose()
			df.index.name='property'

		df=drop_allna_rowcol(drop_allzero_rowcol(df))
		#df=df.loc[:, df.any()]
		#df=df.loc[~(df==0).all(axis=1)]
		#df=df.dropna(axis=0, how='all')
		#df=df.dropna(axis=1, how='all')
		df.to_csv(outf,index=True)

		if transpose:
			samples=df.index.values.tolist()
		else:
			samples=df.columns.values.tolist()
			#samples.remove("property")

		if annot:
			printdisplay(MESSAGE_INFO,mydf,'mydf')
			df = mydf.loc[mydf['datatype'].isin([TYPE_ID,TYPE_PROP])]
			df=df.set_index('property')
			printdisplay(MESSAGE_INFO,df,'df_idprop')
			df=df[samples]
			df=df.transpose()
			df=df[annot]
			if 'NCBI BioProject' in df.columns.values.tolist():
				df['NCBI BioProject']=df['NCBI BioProject'].map(clean_property)

			printdisplay(MESSAGE_INFO,df,'df_annot')
			#df=df[annot]
			df.to_csv(outf +'.annot.csv',index=True,sep=annnotsep)

def do_be_eblm_values(dfvalues,inf,dfannot,covar,be_method):
	"""Batch-effect-correct *dfvalues* via an external R script.

	be_method selects the script: 'eblm' (empirical Bayes linear model)
	or 'rcombat' (ComBat).  covar names the batch column in dfannot.
	Columns are renamed to col_<n> before the call and restored
	afterwards.  Returns the corrected dataframe.
	"""
	icnt=0
	#print(covar)
	#print(dfannot.columns.values.tolist())
	#display(dfannot)
	mapbatch2num=dict()
	for batch in list(set(dfannot[covar].tolist())):
		icnt+=1
		mapbatch2num[batch]=icnt
	# rename columns (R scripts mangle arbitrary column names)
	mapcol2num=dict()
	mapnum2col=dict()
	
	for column in dfvalues.columns.values.tolist():
		icnt+=1
		mapcol2num[column]='col_' + str(icnt)
		mapnum2col['col_' + str(icnt)]=column
	dfvalues=dfvalues.rename(columns=mapcol2num)



	if be_method=='eblm':
		dfvalues.to_csv(inf + '.eblm.data.csv',sep=",",index=True)
		dfannot[covar].to_csv(inf + '.eblm.annot.csv',sep=",",index=True)
		ossystem("Rscript Langfelder-eblm.R " + inf + " " + inf + '.eblm.data.csv' + " " + inf + '.eblm.annot.csv ' + covar )
		df_corr=pd.read_csv(inf + '.eblm.csv',sep=",") #,index_col=0)
	elif be_method=='rcombat':
		dfvalues.to_csv(inf + '.rcombat.data.csv',sep=",",index=True)
		dfannot[covar].to_csv(inf + '.rcombat.annot.csv',sep=",",index=True)
		ossystem("Rscript Langfelder-combat.R " + inf + " " + inf + '.rcombat.data.csv' + " " + inf + '.rcombat.annot.csv ' + covar )
		df_corr=pd.read_csv(inf + '.rcombat.csv',sep=",") #,index_col=0)


	# the R output carries a 1-based row counter in its first column
	df_corr['index']=df_corr.iloc[:,0]-1
	df_corr=df_corr.set_index('index')
	df_corr=df_corr.drop(labels='Unnamed: 0',axis=1)
	#df_corr=df_corr.drop(labels=0,axis=1)
	df_corr=df_corr.rename(columns=mapnum2col)
	return df_corr

def do_be_eblm(df,dflabel,datatype,annot, covar,cleaned=False):
	"""Batch-effect correct one datatype block via the eblm R script and
	re-merge it with the untouched rows.

	NOTE(review): ``outfile`` and ``outf`` are not defined in this
	function or anywhere in this chunk — calling this as written raises
	NameError.  outf is probably meant to be derived from dflabel;
	confirm against callers before fixing.  cleaned is unused.
	"""
	to_csv0(df,datatype,outfile,annot)
	ossystem("Rscript Langfelder-eblm.R " + dflabel + " " + outf + " " + outf +'.annot.csv ' + covar )
	df_corr=pd.read_csv(outf + '.eblm.csv',sep=",",index_col=0)
	df_corr=df_corr.transpose()
	df_corr_all=df[df['datatype']!=datatype]
	df_corr_all=pd.concat([df_corr_all, df_corr])
	return df_corr_all



def shiftTableHeaderTableElements(file):
	"""Write <file>.shifted.txt with the header row shifted one column right.

	A leading tab is prepended to the (stripped) first line so the header
	aligns with the row-label column; all remaining lines are copied
	verbatim.
	"""
	with open(file) as fin, open(file + '.shifted.txt', 'wt') as fout:
		header = fin.readline().strip()
		fout.write("\t" + header + "\n")
		for remaining in fin:
			fout.write(remaining)

def sortedTableElementsCorP(file,topn=10,recalc=False):
	"""Ensure shifted-header versions of the correlation and p-value tables exist.

	file is the .cor. table; the matching p-value table is derived by
	substituting .corPvalue. into the name.  topn is currently unused.
	"""
	pfile=file.replace('.cor.','.corPvalue.')
	if not os.path.exists(file+'.shifted.txt') or recalc:
		shiftTableHeaderTableElements(file)
	if not os.path.exists(pfile+'.shifted.txt') or recalc:
		shiftTableHeaderTableElements(pfile)


def sortedTableElements(file,cortopn=None,maxP=1e-10,mincor=0.8,recalc=False):
	"""Extract the strongest module-trait correlations from a WGCNA table.

	file is the .cor. table; p-values come from the matching .corPvalue.
	file.  Pairs are kept when |cor| >= mincor and p <= maxP (NA cells
	are skipped), sorted by |cor| descending, truncated to cortopn
	entries, and written to <file>.top.txt.  Returns the kept pairs as a
	list of (module, trait, cor, pvalue) tuples.
	"""
	pfile=file.replace('.cor.','.corPvalue.')
	if not os.path.exists(file+'.shifted.txt') or recalc:
		shiftTableHeaderTableElements(file)
	if not os.path.exists(pfile+'.shifted.txt') or recalc:
		shiftTableHeaderTableElements(pfile)

	toplist=[]
	with open(file+'.shifted.txt') as fin, open(pfile+'.shifted.txt') as  pfin, open(file+'.top.txt','wt') as fout:
		fout.write("module\ttrait\tcor\tpvalue\n")
		line=fin.readline().rstrip()
		linep=pfin.readline().rstrip()
		colspheader=[x.replace("\"","") for x in linep.split("\t")]

		if linep!=line:
			printdisplay(MESSAGE_DEBUG,file+'.shifted.txt and '+ pfile+'.shifted.txt  headers did not match')
			raise Exception('sortedTableElements1')

		headers=[x.replace("\"","") for x in line.split("\t")]
		line=fin.readline().rstrip()
		linep=pfin.readline().rstrip()
		absval2pair=dict()
		val2pair=dict()
		pair2pvalue=dict()
		pair2val=dict()
		# first pass: read the p-value table into pair2pvalue,
		# stripping the R quoting and the "ME" module-name prefix
		while linep:
			colsp=linep.split("\t")
			for icol in range(1,len(colsp)):
				try:
					fval=float(colsp[icol])
					pair2pvalue[(colsp[0].replace("\"ME","").replace("\"",""),colspheader[icol])]=fval
				except Exception as ex:
					if colsp[icol]!='NA':
						printdisplay(MESSAGE_DEBUG,str(ex))
						raise ex
			linep=pfin.readline().rstrip()

		printdisplay(MESSAGE_DEBUG,'pair2pvalue=' + str(len(pair2pvalue)))
		printdisplay(MESSAGE_DEBUG,str(list(pair2pvalue.keys())[:5]))

		# second pass: read the correlation table and keep pairs that
		# satisfy the maxP / mincor thresholds
		while line:
			cols=line.split("\t")
			#if cols[0]!=colsp[0]:
			#	printdisplay(MESSAGE_DEBUG,file+'.shifted.txt and '+ pfile+'.shifted.txt lines ' + cols[0] + '!=' + colsp[0]  + ' did not match')
			#	raise Exception('sortedTableElements2')
			for icol in range(1,len(cols)):
				try:
					fval=float(cols[icol])
					includevalue=True
					colspicol=None
					pair=(cols[0].replace("\"ME","").replace("\"",""),headers[icol])
					if pair in pair2pvalue:
						colspicol=pair2pvalue[pair]
					else:
						printdisplay(MESSAGE_DEBUG,str(pair) + ' not pair2pvalue')
						includevalue=False
					if includevalue and maxP and colspicol and colspicol>maxP:
						printdisplay(MESSAGE_DEBUG,'not maxP and colspicol and colspicol>maxP  ' + str(colspicol) + ' > ' +  str(maxP) )
						includevalue=False
					if includevalue and mincor and abs(float(cols[icol]))<mincor:
						printdisplay(MESSAGE_DEBUG,'not mincor and float(cols[icol])<mincor  ' +str(cols[icol]) + ' < ' + str(mincor) )
						includevalue=False
					if includevalue:
						if not fval in val2pair:
							val2pair[fval]=[]
						if not abs(fval) in absval2pair:
							absval2pair[abs(fval)]=[]
						modname=cols[0].replace("\"ME","").replace("\"","")
						traitname=headers[icol]
						val2pair[fval].append((modname,traitname))
						absval2pair[abs(fval)].append((modname,traitname))
						pair2val[(modname,traitname)]=fval

						#printdisplay(MESSAGE_DEBUG,'include ' + modname + ' ' + traitname + ' ' + str(fval))
						#pair2pvalue[(modname,traitname)]=float(colsp[icol])
				except Exception as ex:
					if cols[icol]!='NA':
						printdisplay(MESSAGE_DEBUG,str(ex))
						print('colspicol=' + str(colspicol))
						print('maxP=' + str(maxP))
						print('mincor=' + str(mincor))
						raise ex
			line=fin.readline().rstrip()

		printdisplay(MESSAGE_DEBUG,'val2pair=' + str(len(val2pair)))
		printdisplay(MESSAGE_DEBUG,str(list(val2pair.keys())[:5]))

		#exit()

		# emit pairs ordered by |cor| descending, up to cortopn of them
		ivalcnt=0
		for ival in sorted(list(absval2pair.keys()),reverse=True):
			ivalcnt+=1
			try:
				if cortopn and ivalcnt>cortopn:
					break
				for ipairs in absval2pair[ival]:
					fout.write(ipairs[0]+"\t"+ipairs[1]+"\t"+str(pair2val[ipairs])+"\t" + str(pair2pvalue[ipairs]) + "\n")
					toplist.append((ipairs[0],ipairs[1],pair2val[ipairs],pair2pvalue[ipairs]))
			except Exception as ex:
				printdisplay(MESSAGE_INFO, str(ex))
				print('cortopn=' + str(cortopn))
				print('ivalcnt=' + str(ivalcnt))
				raise ex

		printdisplay(MESSAGE_DEBUG,'generated ' + file+'.top.txt')
		printdisplay(MESSAGE_DEBUG,'toplist='+ str(len(toplist)))
	return toplist

# requires R with WGCNA package
# based on R scripts from #https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html
def do_wgcna_prepare0(mydf,dflabel,datatypex=TYPE_EXP,datatypey=TYPE_PHEN,recalc=False,cortopn=None,maxP=1e-10,mincor=0.8,exp_file=None):
	"""Run WGCNA pipeline stages 01-03a via the Langfelder R scripts.

	Exports expression (datatypex) and trait (datatypey) tables, builds
	the network, relates modules to traits, and renders the result PDFs
	as PNGs.  Intermediate .RData files are reused unless recalc is set.
	exp_file, when given, supplies the expression matrix from disk; its
	samples are intersected with those that have phenotypes.  Returns the
	top (module, trait, cor, pvalue) list from sortedTableElements.

	NOTE(review): clean_project and verify_samples0 are not defined in
	this chunk — confirm they exist elsewhere in the module.
	"""
	#printdisplay(MESSAGE_DEBUG,'do_wgcna_prepare ' + dflabel)
	printdisplay(MESSAGE_DEBUG,mydf,'do_wgcna_prepare ' + dflabel)

	if recalc:
		clean_project(dflabel)


	if not os.path.exists(dflabel + '-01-dataInput.RData') or recalc:
		if exp_file:
			# expression matrix supplied on disk: keep only samples present
			# in both the expression file and the cleaned phenotype rows
			df_expfile=pd.read_csv(exp_file,sep="\t",index_col=None)
			df_cleanphen=get_clean_samples(mydf,TYPE_PHEN)
			printdisplay(MESSAGE_DEBUG, df_cleanphen,'df_cleanphen')
			hasphen= set(df_expfile.columns.values.tolist()).intersection(set(df_cleanphen.columns.values.tolist()))
			hasphen.discard('datatype')
			hasphen.discard('property')
			printdisplay(MESSAGE_INFO,'df_expfile=' + str(df_expfile.shape))
			printdisplay(MESSAGE_INFO,'df_cleanphen=' + str(df_cleanphen.shape))
			printdisplay(MESSAGE_INFO, exp_file + ' has ' + str(len(hasphen)) + ' common samples with phenotypes')
			df_expfile_hasphen=df_expfile[['datatype','property'] + sorted(list(hasphen))]
			df_cleanphen_hasexp=mydf[['datatype','property'] + sorted(list(hasphen))]

			printdisplay(MESSAGE_DEBUG, df_cleanphen_hasexp,'df_cleanphen_hasexp')
			to_csv0(df_cleanphen_hasexp, datatype=datatypey,outf=dflabel + '.trait.csv',transpose=True,annot=['phenotype_dataset','NCBI BioProject'])
			to_csv0(df_expfile_hasphen, datatype=datatypex,outf=dflabel + '.genes.csv',annot=['NCBI BioProject','expression_dataset'])
		else:
			[dummy1,dummy2,mydf]=verify_samples0(mydf, datatype1=datatypex, datatype2=datatypey, common_only=True)
			to_csv0(mydf, datatype=datatypey,outf=dflabel + '.trait.csv',transpose=True,annot=['phenotype_dataset','NCBI BioProject'])
			to_csv0(mydf, datatype=datatypex,outf=dflabel + '.genes.csv',annot=['NCBI BioProject','expression_dataset'])

		cmd='Rscript Langfelder-01-dataInput.R ' + dflabel + ' ' + dflabel +'.genes.csv ' + dflabel + ' ' + dflabel + '.trait.csv'
		printdisplay(MESSAGE_DEBUG,cmd)
		ossystem(cmd)

	if not os.path.exists(dflabel + '-02-networkConstruction-auto.RData') or recalc:
		cmd='Rscript Langfelder-02-networkConstr-auto.R ' + dflabel
		printdisplay(MESSAGE_DEBUG,cmd)
		ossystem(cmd)
		pdf2png(dflabel+".ScaleAnalysis.pdf")
		pdf2png(dflabel+".ClusterDendogram.pdf")

	pltshow(MESSAGE_INFO,dflabel+".ScaleAnalysis.png")
	pltshow(MESSAGE_INFO,dflabel+".ClusterDendogram.png")


	cmd='Rscript Langfelder-03a-relateModsToExt.R ' + dflabel 
	printdisplay(MESSAGE_DEBUG,cmd)
	ossystem(cmd)
	printdisplay(MESSAGE_DEBUG,'check modules in ' + dflabel + '.ModuleTraitRelationship.pdf')

	printdisplay(MESSAGE_DEBUG,'get module details usng do_wgcna_visualize')
	

	toplist=sortedTableElements(dflabel+".ModuleTraitRelationship.cor.txt",cortopn=cortopn,maxP=maxP,mincor=mincor,recalc=recalc)
	printdisplay(MESSAGE_DEBUG,'top correlated modules in ' +dflabel +".ModuleTraitRelationship.cor.txt.top" )

	pdf2png(dflabel+".ModuleTraitRelationship.pdf")
	pltshow(MESSAGE_INFO, dflabel+".ModuleTraitRelationship.png")


	return toplist



def do_wgcna_modulesgenes0(mydf,dflabel='df4wgcna',datatypex='4 EXP',datatypey=TYPE_PHEN,recalc=False,moduletrait=['brown,Total_cannabinoids'],annotfle="cs10-genes3.csv",gstopn=None,maxP=1e-10,mings=0.8,annotate=False):
	"""Collect, filter and export the genes most correlated with each trait's module.

	Runs the Langfelder 03b R script once, then reads each per-trait
	<dflabel>-<trait>-geneInfo.csv it produced, keeps genes with
	|GS| > mings and p.GS < maxP (optionally only the top gstopn by |GS|),
	writes summary tables/plots and returns the combined gene x trait table
	with columns ['expgenes','trait','GS','p.GS'].

	moduletrait entries are 'moduleColor,traitName' strings; only the first
	entry per unique trait is processed.
	Results are cached in <dflabel>.wgcna.top.tsv (delete it or pass
	recalc=True upstream to rebuild).
	Returns None when no geneInfo file was found for any trait
	(previously this crashed with AttributeError on None.to_csv).
	"""
	printdisplay(MESSAGE_DEBUG,'do_wgcna_modulesgenes ' + dflabel)

	if not os.path.exists(dflabel + '-03a-relateModsToExt.RData') or recalc:
		do_wgcna_prepare0(mydf,dflabel,datatypex=datatypex,datatypey=datatypey,recalc=recalc)

	# process first unique trait only
	hastrait=set()
	newmoduletrait=[]
	for modtrait in moduletrait:
		cols=modtrait.split(",",1)
		if not cols[1] in hastrait:
			newmoduletrait.append(modtrait)
			hastrait.add(cols[1])

	moduletrait=newmoduletrait

	if not os.path.exists(dflabel+'.wgcna.top.tsv'):
		cmd='Rscript Langfelder-03b-relateModsToExt.R ' + dflabel + ' ' + annotfle + " " + " ".join(moduletrait)
		printdisplay(MESSAGE_DEBUG,cmd)
		ossystem(cmd)

		GS=[]		# gene significance values across all traits (for histogram)
		pGS=[]		# matching p-values
		pdgenepass=None	# accumulated filtered gene x trait rows
		for modtraits in moduletrait:
			modtrait=modtraits.split(",")
			if os.path.exists(dflabel + "-" + modtrait[1] +"-geneInfo.csv"):
				dfgeneInfo=pd.read_csv(dflabel + "-" + modtrait[1]  +"-geneInfo.csv" ,sep=',',index_col=0)

				# geneInfo columns include: "substanceBXH","geneSymbol","LocusLinkID",
				# "moduleColor","GS.<trait>","p.GS.<trait>"
				gstrait="GS." + modtrait[1]
				pgstrait="p.GS." + modtrait[1]

				printdisplay(MESSAGE_DEBUG,dfgeneInfo)

				dfgeneInfo=dfgeneInfo[['substanceBXH',gstrait, pgstrait]]
				GS=GS+ dfgeneInfo[gstrait].tolist()
				pGS=pGS+ dfgeneInfo[pgstrait].tolist()

				printdisplay(MESSAGE_DEBUG,dfgeneInfo)

				# filter on absolute GS so strongly negative correlations pass too
				dfgeneInfo['absgstrait']=dfgeneInfo[gstrait].abs()
				dfgeneInfoPass=dfgeneInfo[(dfgeneInfo['absgstrait']>mings) & (dfgeneInfo[pgstrait]<maxP) ].sort_values(['absgstrait'],ascending=False)
				if gstopn:
					dfgeneInfoPass=dfgeneInfoPass.head(gstopn)

				# normalize to shared column names before stacking traits
				dfgeneInfoPass = dfgeneInfoPass.assign(trait = modtrait[1])
				dfgeneInfoPass = dfgeneInfoPass.rename(columns={gstrait: "GS", pgstrait: "p.GS"})
				dfgeneInfoPass = dfgeneInfoPass[['substanceBXH','trait','GS','p.GS']]
				if pdgenepass is None:
					pdgenepass=dfgeneInfoPass
				else:
					pdgenepass=pd.concat([pdgenepass,dfgeneInfoPass])

				pdf2png(dflabel+".MM_"+modtrait[0]+"_vs_GS_"+modtrait[1]+".pdf")

				cmd="cp " + dflabel+".MM_"+modtrait[0]+"_vs_GS_"+modtrait[1]+".png " + dflabel+".MM_"+modtrait[0] + ".png "
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)
			else:
				# bug fix: message previously reported modtrait[0] (module color)
				# although the file checked above is keyed by modtrait[1] (trait)
				printdisplay(MESSAGE_DEBUG,'file not found ' + dflabel + "-" + modtrait[1] +"-geneInfo.csv")

		# bug fix: bail out instead of crashing when the R step produced nothing
		if pdgenepass is None:
			printdisplay(MESSAGE_ERROR,'no geneInfo files found for ' + dflabel)
			return None

		pdgenepass.to_csv(dflabel +'.relateModsToExt.top.genetrait.tsv',sep="\t",index=False)
		printdisplay(MESSAGE_DEBUG,'generated ' + dflabel +'.relateModsToExt.top.genetrait.tsv')
		plt.clf()
		plt.hist(GS, bins=20, alpha=0.5)
		plt.xlabel('GS, gene significance')
		plt.ylabel('count gene x trait, all modules')
		plt.savefig(dflabel +'.relateModsToExt.GS.png',dpi=300,format='png')
		plt.clf()
		# NOTE(review): -log10 raises ValueError if any p.GS is exactly 0 — confirm upstream
		plt.hist([-math.log10(x) for x in pGS], bins=20, alpha=0.5)
		plt.xlabel('-log10 pvalue GS gene sgnificance')
		plt.ylabel('count gene x trait, all modules')
		plt.savefig(dflabel +'.relateModsToExt.pGS.png',dpi=300,format='png')
		plt.clf()
		printdisplay(MESSAGE_INFO,'generated ' + dflabel +'.relateModsToExt.GS.png')
		printdisplay(MESSAGE_INFO,'generated ' + dflabel +'.relateModsToExt.pGS.png')

		pltshow(MESSAGE_INFO,dflabel +'.relateModsToExt.GS.png')
		pltshow(MESSAGE_INFO,dflabel +'.relateModsToExt.pGS.png')

		printdisplay(MESSAGE_DEBUG,pdgenepass)

		# R writes its combined plot to Rplots.pdf; rename and rasterize it
		osmv("Rplots.pdf " +  dflabel + '.MEvsGS.pdf')
		printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.MEvsGS.pdf')
		pdf2png(dflabel + '.MEvsGS.pdf')
		printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.MEvsGS.png')

		if annotate:
			# attach gene annotations and write the annotated variant alongside
			pdgenepassGeneAnnot=get_genes_by_accession0(list(set(pdgenepass['substanceBXH'].tolist())), annot='all')
			pdgenepassGeneAnnot.to_csv(dflabel +'.relateModsToExt.top.genetrait.genes.tsv',sep="\t",index=False)
			printdisplay(MESSAGE_DEBUG,pdgenepassGeneAnnot)

			pdgenepassGeneAnnot=pdgenepassGeneAnnot.rename(columns={"name":"substanceBXH"})

			printdisplay(MESSAGE_DEBUG,pdgenepassGeneAnnot)

			pdgenepassWithGeneAnnot=pd.merge(pdgenepass, pdgenepassGeneAnnot, on="substanceBXH", how="left")
			pdgenepassWithGeneAnnot=pdgenepassWithGeneAnnot.rename(columns={"substanceBXH":"expgenes"})
			pdgenepassWithGeneAnnot.to_csv(dflabel +'.wgcna.top.annot.tsv',sep="\t",index=False)

		pdgenepass = pdgenepass[['substanceBXH','trait','GS','p.GS']]
		pdgenepass =pdgenepass.rename(columns={'substanceBXH':'expgenes'})
		pdgenepass.to_csv(dflabel+'.wgcna.top.tsv',sep="\t",index=False)
		printdisplay(MESSAGE_INFO,'generated ' + dflabel+'.wgcna.top.tsv')
	else:
		# cached result from a previous run
		pdgenepass=pd.read_csv(dflabel+'.wgcna.top.tsv',sep="\t")

	if os.path.exists(dflabel +'.wgcna.top.annot.tsv'):
		printdisplay(MESSAGE_INFO,dflabel+'.wgcna.top.annot.tsv','top expressed gene-trait annotated ' + dflabel+'.wgcna.top.annot.tsv')
	else:
		printdisplay(MESSAGE_INFO,dflabel+'.wgcna.top.tsv','top expressed gene-trait ' + dflabel+'.wgcna.top.tsv')

	return pdgenepass


def nodeedge2cytoscape():
	"""Placeholder for exporting node/edge tables to Cytoscape; currently a no-op that returns None."""
	return None


# requires R with WGCNA package
# based on R scripts from #https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html
def do_wgcna_visualize0(mydf,dflabel,datatypex,datatypey,recalc,moduletrait,exportNetwork):
	"""Render WGCNA network/eigengene plots for the given module,trait pairs.

	moduletrait entries are 'moduleColor,traitName' strings; the R visualization
	script is run per unique trait and the resulting PDFs are rasterized to PNG.
	When exportNetwork is truthy, the network export script is also run for the
	unique module colors.

	Bug fix: the parameter was previously declared 'exportNetwor' while the body
	tested 'exportNetwork', which raised NameError on every call that reached
	the export branch.
	"""
	printdisplay(MESSAGE_DEBUG,'do_wgcna_visualize ' + dflabel)

	# make sure the upstream network construction RData exists
	if not os.path.exists(dflabel + '-02-networkConstruction-auto.RData') or recalc:
		do_wgcna_prepare0(mydf,dflabel,datatypex=datatypex,datatypey=datatypey,recalc=recalc)

	traitlist=sorted(set([x.split(",")[1] for x in  moduletrait]))
	modlist=sorted(set([x.split(",")[0] for x in  moduletrait]))

	cmd='Rscript Langfelder-05-Visualization.R ' + dflabel + ' ' + " ".join(traitlist)
	printdisplay(MESSAGE_DEBUG,cmd)
	ossystem(cmd)

	for trait_select in traitlist:
		pdf2png(dflabel+"-"+trait_select+".NetworkHeatmapPlot.pdf")
		pdf2png(dflabel+"-"+trait_select+".EigengeneDendrogram.pdf")
		pdf2png(dflabel+"-"+trait_select+".EigengeneAdjacencyHeatmap.pdf")

	if exportNetwork:
		cmd='Rscript Langfelder-06-ExportNetwork.R ' + dflabel + ' ' + " ".join(modlist)
		printdisplay(MESSAGE_DEBUG,cmd)
		ossystem(cmd)




def plot_histograms0(mydf, myunit,dslabel):
	"""Draw one histogram per phenotype row on a 5-wide grid of subplots.

	The figure is saved as '<dslabel>_phenotypes_<myunit>.png'; myunit labels
	the shared x axis in the suptitle.
	"""
	df_phen = mydf.loc[mydf['datatype'].str.startswith(TYPE_PHEN)]

	ncols = 5
	nrows = int(df_phen.shape[0] / ncols) + 1

	fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(imgsize_xlarge_in[0], imgsize_xlarge_in[1]))
	plt.tight_layout()

	prop_loc = df_phen.columns.get_loc('property')
	for i in range(df_phen.shape[0]):
		r, c = divmod(i, ncols)
		# sample values start at column 2 ('datatype' and 'property' come first)
		df_phen.iloc[i, 2:].hist(ax=axes[r, c], alpha=0.4)
		axes[r, c].set_title(df_phen.iloc[i, prop_loc])

	fig.suptitle('x: ' + myunit + ' ,y: frequency')
	plt.savefig(dslabel +'_phenotypes_' + myunit + '.png')


	
def plot_histograms_multi0(mydfs, myunit,dslabels,phenminmaxbin=None,heatmap=False, maporder='phen-source',label2color=None,nbins=10,bydataset=None):
	"""Overlay per-dataset histograms for every phenotype; optionally draw heatmaps.

	mydfs/dslabels: parallel lists of long-format dataframes and their labels.
	myunit: unit string used in titles and output file names.
	phenminmaxbin: optional dict phen -> (min, max, bins); min/max clamp the
		histogram range, bins (if not None) overrides nbins for that phenotype.
	heatmap: when True, additionally writes three sample/trait heatmap PNGs
		and matching TSV tables.
	label2color: optional dataset-label -> color map; as soon as one dataset is
		missing from it, coloring falls back to matplotlib defaults.
		(Default changed from a shared mutable dict() to None — same behavior.)
	nbins: default bin count.
	maporder and bydataset are currently unused — kept for interface stability.

	Bug fixes: the sample/value length-mismatch exception message no longer
	raises TypeError itself (int concatenated into str), and the range scan
	uses the min()/max() builtins — the per-dataset values are plain Python
	lists, which have no .min()/.max() methods.
	"""
	if label2color is None:
		label2color = dict()

	label_url=dslabels
	phen2i=dict()		# phenotype name -> [(row index, dataset label), ...]
	label2dfphen=dict()	# dataset label -> its phenotype sub-frame

	setSamples=[]

	idf=0
	ds2sampleslist=dict()	# dataset label -> ['sample&label', ...]
	for mydf in mydfs:
		printdisplay(MESSAGE_DEBUG,'processing plot_histograms_multi0 ' + dslabels[idf])
		df_phen=mydf.loc[mydf['datatype'].str.startswith(TYPE_PHEN)]
		label2dfphen[dslabels[idf]]=df_phen

		#get union of phenotypes, and match their i's
		for i in range(df_phen.shape[0]):
			phenname=df_phen.iloc[i, df_phen.columns.get_loc('property')]
			if phenname in phen2i:
				phen2i[phenname].append((i, dslabels[idf]))
			else:
				phen2i[phenname]=[(i,dslabels[idf])]

		# qualify each sample name with its dataset label ('sample&label')
		samplessrc=[]
		for s in  list(df_phen.columns)[2:]:
			samplessrc.append(s + '&' + dslabels[idf])

		setSamples=setSamples+samplessrc
		ds2sampleslist[dslabels[idf]]=samplessrc

		idf+=1

	nphens=len(phen2i)

	setSamples=list(sorted(set(setSamples)))

	plotcols=5
	plotrows= int(nphens/plotcols)+1

	# fig/ax: raw counts; fig2/ax2: density-normalized variants
	fig, ax = plt.subplots(nrows=plotrows, ncols=plotcols, figsize=(imgsize_xlarge_in[0], imgsize_xlarge_in[1]))
	fig2, ax2 = plt.subplots(nrows=plotrows, ncols=plotcols, figsize=(imgsize_xlarge_in[0], imgsize_xlarge_in[1]))

	phen2samplesrc2val=dict()
	phen2samplesrc2vallist=[]

	phensrc2sample2vallist=[]
	phensrclist=[]

	iphen=0
	for phen in list(phen2i.keys()):
		irow=int(iphen/plotcols)
		icol=iphen % plotcols

		idflabel_list = phen2i[phen]

		minmaxbin=None
		if phenminmaxbin and phen in phenminmaxbin:
			minmaxbin=phenminmaxbin[phen]

		x_multi=[]		# one value-list per dataset holding this phenotype
		colors=[]
		labels=[]
		samplesrc2val=dict()
		for (i, dflabel) in idflabel_list:
			df_phen=label2dfphen[dflabel]

			dfrow=df_phen.iloc[i,2:]

			phenslist=dfrow.to_numpy().tolist()
			x_multi.append(phenslist)

			if len(ds2sampleslist[dflabel])!=len(phenslist):
				# bug fix: the second length must be stringified before concatenation
				raise Exception('len(ds2sampleslist[dflabel])!=len(phenslist)  ' + str(len(ds2sampleslist[dflabel])) + ' ' + str(len(phenslist)))

			sampleslist=ds2sampleslist[dflabel]
			sample2val=dict()
			for ival in range(len(sampleslist)):
				samplesrc=sampleslist[ival]
				samplesrc2val[samplesrc]=phenslist[ival]
				samplename=samplesrc.split("&")[0]
				sample2val[samplename]=phenslist[ival]

			if label2color and len(label2color)>0 and dflabel in label2color:
				colors.append(label2color[dflabel])
			else:
				# one unmapped dataset disables explicit coloring for the rest
				label2color=dict()
				colors=[]

			labels.append(dflabel)

			phensrclist.append(phen + '&' + dflabel)
			phensrc2sample2vallist.append(sample2val)

		phen2samplesrc2val[phen]=samplesrc2val
		phen2samplesrc2vallist.append(samplesrc2val)

		mybins=nbins
		if minmaxbin and not (minmaxbin[2] is None):
			mybins=minmaxbin[2]

		if minmaxbin and  (minmaxbin[0] or minmaxbin[1]):
			# scan data range first, then clamp with whichever bound(s) were given
			# bug fix: x_multi holds plain lists — use the min()/max() builtins
			curmax=max(x_multi[0])
			curmin=min(x_multi[0])
			for dfset in x_multi[1:]:
				fmin=min(dfset)
				if fmin<curmin:
					curmin=fmin
				fmax=max(dfset)
				if fmax>curmax:
					curmax=fmax

			if minmaxbin[0]  and minmaxbin[1]:
				curmax=minmaxbin[1]
				curmin=minmaxbin[0]
			elif minmaxbin[1]:
				curmax=minmaxbin[1]
			elif minmaxbin[0]:
				curmin=minmaxbin[0]

			if colors:
				ax[irow,icol].hist( x_multi, mybins, histtype='bar',color=colors,label=labels,range=(curmin , curmax))
				ax2[irow,icol].hist( x_multi, mybins,  density=True, histtype='bar',color=colors,label=labels,range=(curmin , curmax))
			else:
				ax[irow,icol].hist( x_multi, mybins, histtype='bar',label=labels,range=(curmin , curmax))
				ax2[irow,icol].hist( x_multi, mybins,  density=True, histtype='bar',label=labels,range=(curmin , curmax))
		else:
			if colors:
				ax[irow,icol].hist( x_multi, mybins, histtype='bar',color=colors,label=labels)
				ax2[irow,icol].hist( x_multi, mybins, density=True, histtype='bar',color=colors,label=labels)
			else:
				ax[irow,icol].hist( x_multi, mybins, histtype='bar',label=labels)
				ax2[irow,icol].hist( x_multi, mybins, density=True, histtype='bar',label=labels)

		ax2[irow,icol].legend(fontsize='xx-small',title_fontsize='xx-small',framealpha=0.1)
		ax[irow,icol].legend(fontsize='xx-small',title_fontsize='xx-small',framealpha=0.1)

		ax[irow,icol].set_title(phen)
		ax2[irow,icol].set_title(phen + ' density')

		iphen+=1

	fig.suptitle('x: ' + myunit + ' ,y: frequency')

	plt.figure(num=fig.number)
	plt.tight_layout()
	plt.savefig(str(len(dslabels)) + '_' + str(hash(str(dslabels))) +'_phenotypes_' + myunit + '.png')
	pltshow(MESSAGE_INFO,'generated ' + str(len(dslabels)) + '_' + str(hash(str(dslabels))) +'_phenotypes_' + myunit + '.png')

	fig2.suptitle('x: ' + myunit + ' ,y: frequency')
	plt.figure(num=fig2.number)
	plt.tight_layout()
	plt.savefig(str(len(dslabels)) + '_' + str(hash(str(dslabels))) +'_phenotypes_' + myunit + '_density.png')
	pltshow(MESSAGE_INFO,'generated ' + str(len(dslabels)) + '_' + str(hash(str(dslabels))) +'_phenotypes_' + myunit + '_density.png')

	plt.clf()

	if heatmap:

		# NOTE(review): rows are in phen2i insertion order but labeled with the
		# sorted phenotype names — labels may be misassigned when those orders
		# differ; confirm intended behavior before changing.
		dfHeatmap=pd.DataFrame(phen2samplesrc2vallist,list(sorted(phen2i.keys())), setSamples).dropna(how='all').transpose().dropna(how='all')

		fig = plt.figure(figsize=(imgsize_xlarge_in[0], imgsize_xlarge_in[1]))

		hmfontsize='xx-small'

		# NOTE(review): cm.get_cmap is deprecated/removed in matplotlib >= 3.9;
		# migrate to matplotlib.colormaps["jet"] when the pinned version allows.
		cmap = copy.copy(cm.get_cmap("jet"))
		cmap.set_bad('white',1.)
		cmap.set_under('gray',1.0)
		cmap.set_over('black',1.0)

		mesh1=plt.pcolormesh(dfHeatmap, cmap=cmap, norm=pltcolors.LogNorm())
		plt.yticks(np.arange(0.5, len(dfHeatmap.index), 1), dfHeatmap.index)
		plt.subplots_adjust(left=0.15, right=0.98, top=0.98, bottom=0.1)
		plt.xticks(rotation=90,fontsize=hmfontsize)
		plt.yticks(ha='right',fontsize=hmfontsize)
		plt.xticks(np.arange(0.5, len(dfHeatmap.columns), 1), dfHeatmap.columns)

		fig.colorbar(mesh1)

		pltshow(MESSAGE_INFO,plt)

		printdisplay(MESSAGE_DEBUG,'heatmap 1 -by_sample_dataset')
		plt.savefig("-".join(label_url) + '.phen-heatmap-by_sample_dataset.png',dpi=300)

		dfHeatmap.to_csv( "-".join(label_url) + '.phen2samplesrc-by_sample_dataset.txt' , sep="\t")
		printdisplay(MESSAGE_INFO,"generated " +  "-".join(label_url) + '.phen2samplesrc-by_sample_dataset.txt')

		printdisplay(MESSAGE_DEBUG,'heatmap 2 -by_trait_dataset')
		samplenames=set()
		for namesrc in setSamples:
			samplenames.add(namesrc.split("&")[0])
		samplenames=list(sorted(samplenames))
		dfHeatmap=pd.DataFrame(phensrc2sample2vallist,phensrclist, samplenames).dropna(how='all').transpose().dropna(how='all')
		fig = plt.figure(figsize=(imgsize_xlarge_in[1], imgsize_xlarge_in[0]))
		mesh1=plt.pcolormesh(dfHeatmap, cmap=cmap, norm=pltcolors.LogNorm())
		plt.yticks(np.arange(0.5, len(dfHeatmap.index), 1), dfHeatmap.index)
		plt.subplots_adjust(left=0.15, right=0.98, top=0.98, bottom=0.25)
		plt.xticks(rotation=90,fontsize=hmfontsize)
		plt.yticks(ha='right',fontsize=hmfontsize)
		plt.xticks(np.arange(0.5, len(dfHeatmap.columns), 1), dfHeatmap.columns)
		fig.colorbar(mesh1)
		pltshow(MESSAGE_INFO,plt)
		plt.savefig("-".join(label_url) + '.phen-heatmap-by_trait_dataset.png',dpi=300)
		dfHeatmap.to_csv( "-".join(label_url) + '.phensrc2sample-by_trait_dataset.txt' , sep="\t")
		printdisplay(MESSAGE_INFO,"generated " + "-".join(label_url) + '.phensrc2sample-by_trait_dataset.txt')

		# reorder columns: 'phen&dataset' -> 'dataset&phen' so datasets group together
		newcolumns=set()
		old2new=dict()
		for phensrc in dfHeatmap.columns:
			ps=phensrc.split('&')
			new=ps[1] + '&' + ps[0]
			newcolumns.add(new)
			old2new[phensrc]=new
		newcolumns=list(sorted(newcolumns))

		printdisplay(MESSAGE_DEBUG,'heatmap 3 -by_dataset_trait')
		dfHeatmap = dfHeatmap.rename(columns=old2new)
		dfHeatmap = dfHeatmap.reindex(columns=newcolumns)
		fig = plt.figure(figsize=(imgsize_xlarge_in[1], imgsize_xlarge_in[0]))
		mesh1=plt.pcolormesh(dfHeatmap, cmap=cmap, norm=pltcolors.LogNorm())
		plt.yticks(np.arange(0.5, len(dfHeatmap.index), 1), dfHeatmap.index)
		plt.subplots_adjust(left=0.15, right=0.98, top=0.98, bottom=0.25)
		plt.xticks(rotation=90,fontsize=hmfontsize)
		plt.yticks(ha='right',fontsize=hmfontsize)
		plt.xticks(np.arange(0.5, len(dfHeatmap.columns), 1), dfHeatmap.columns)
		fig.colorbar(mesh1)
		pltshow(MESSAGE_DEBUG,plt)
		plt.savefig("-".join(label_url) + '.phen-heatmap-by_dataset_trait.png',dpi=300)
		dfHeatmap.to_csv( "-".join(label_url) + '.phensrc2sample-by_dataset_trait.txt' , sep="\t")

		printdisplay(MESSAGE_INFO, "generated " +  "-".join(label_url) + '.phensrc2sample-by_dataset_trait.txt')



def get_phenotypes0(mydf):
	"""Return the 'property' names of all phenotype rows in mydf, as strings."""
	df_phen = mydf.loc[mydf['datatype'].str.startswith(TYPE_PHEN)]
	return [str(df_phen.loc[idx]['property']) for idx in df_phen.index.values.tolist()]


def phenotype_file(mydf, phen, outf,family_id='0'):
	"""Write one phenotype as a PLINK-style pheno file and return its values.

	Each line of '<outf>.txt' is 'FID<TAB>sample<TAB>value'; when family_id is
	falsy the sample name doubles as the family id. NaN values are skipped.
	Returns the written values as a list of floats.
	"""
	df_phen = mydf.loc[(mydf['datatype']==TYPE_PHEN) & (mydf['property']==phen)]

	phenvals=[]
	with open(outf + '.txt' ,'w') as fout:
		for samn in df_phen.columns[2:]:
			for phenidx in df_phen.index.values.tolist():
				val=str(df_phen.loc[phenidx][samn])
				if val=="nan":
					continue
				first_col = family_id if family_id else samn
				fout.write(first_col + "\t" + samn + "\t" + val +"\n")
				phenvals.append(float(val))
	return phenvals

def covariate_file(mydf, outf,family_id='0',covar=None):
	"""Write a PLINK-style covariate table (FID, IID + covariate columns) to outf.

	mydf: long-format dataframe with 'datatype'/'property' columns plus one
		column per sample.
	outf: output path for the tab-separated table.
	family_id: constant value for the FID column.
	covar: list of property names to keep as covariates; when None, all
		ID/PROP properties are written.
		(Bug fix: covar=None previously raised TypeError on ['FID','IID']+None.)
	"""
	printdisplay(MESSAGE_DEBUG,mydf,'mydf')
	# keep only identifier/property rows
	df_prop = mydf.loc[mydf['datatype'].isin([TYPE_ID,TYPE_PROP])]
	printdisplay(MESSAGE_DEBUG,df_prop,'df_prop')
	# pivot: each property becomes a column, each sample a row
	df_prop=df_prop.set_index('property').drop('datatype',axis=1).transpose()
	properties=df_prop.columns.values.tolist()
	df_prop['IID']=df_prop.index
	df_prop['FID']=family_id
	selected = covar if covar is not None else properties
	df_prop=df_prop[['FID','IID']+selected]
	printdisplay(MESSAGE_DEBUG,df_prop,'df_prop')
	df_prop.to_csv(outf,sep="\t",index=False)




def to_hapmap0(mydf, outf):
	"""Export the SNP rows of mydf as HapMap ('<outf>.hmp') plus a PLINK '<outf>.map'.

	Only positions with more than one distinct allele letter across samples are
	written; missing, ambiguous or deletion calls ('nan', '..', '*') become '.'.
	The 'property' column is expected to hold 'chrom:pos' strings.
	"""
	printdisplay(MESSAGE_DEBUG,'generating ' + outf + '.hmp ' + outf + '.map'  )
	# start from a clean slate
	osrm( outf + '.hmp')
	osrm( outf + '.map')
	with open(outf + '.hmp' ,'w') as fout,  open(outf + '.map' ,'w') as fmap:
		fout.write("rs#\talleles\tchrom\tpos\tstrand\tassembly#\tcenter\tprotLSID\tassayLSID\tpanelLSID\tQCcode")
		df_snp = mydf.loc[mydf['datatype'].str.startswith('5 SNP')]
		printdisplay(MESSAGE_DEBUG,'to_hapmap0')
		printdisplay(MESSAGE_DEBUG,df_snp.shape)
		printdisplay(MESSAGE_DEBUG,list(df_snp.columns.values))

		samples=df_snp.columns[2:]
		# finish the header line with one column per sample
		for samn in samples:
			fout.write("\t" + samn)
		fout.write("\n")

		for pos in df_snp.index.values.tolist():
			row=df_snp.loc[[pos]]
			calls=[]
			for samn in samples:
				al=str(row.loc[pos][samn]).replace("..",".").replace("nan",".")
				# collapse deletions / partial calls to a single missing marker
				if '*' in al or '.' in al:
					al='.'
				calls.append(al)
			allelestrout="".join("\t" + al for al in calls)

			# distinct allele letters, ignoring separators and missing calls
			alleles=set(allelestrout) - {"\t", "."}
			chrpos=str(row.loc[pos]['property'])
			chrposarr=chrpos.split(":")
			if len(alleles)>1:
				fout.write( chrpos +  "\t" + "/".join(list(alleles)) +  "\t" + chrposarr[0] + "\t" + chrposarr[1] +"\t+\tNA\tNA\tNA\tNA\tNA\tNA" +  allelestrout + "\n")
				fmap.write(chrposarr[0] + "\t" + chrpos + "\t0\t" +  chrposarr[1] + "\n")




def to_ped0(mydf, outf):
	# Export the SNP rows of mydf to '<outf>.ped' (and open '<outf>.map').
	# NOTE(review): despite the .ped name, the header and row layout written
	# here are HapMap-style (same as to_hapmap0), and fmap is opened but never
	# written to — confirm whether this legacy variant is still used.
	with open(outf + '.ped' ,'w') as fout, open(outf + '.map' ,'w') as fmap:
		fout.write("rs#\talleles\tchrom\tpos\tstrand\tassembly#\tcenter\tprotLSID\tassayLSID\tpanelLSID\tQCcode");
		df_snp = mydf.loc[mydf['datatype'].str.startswith('5 SNP')]

		# sample columns start after 'datatype' and 'property'
		samples=df_snp.columns[2:]
		for samn in samples:
			fout.write("\t" + samn)
		fout.write("\n")

		#printdisplay(MESSAGE_DEBUG,c.index.values.tolist())
		printdisplay(MESSAGE_DEBUG,'df_exp index ' + str(df_snp.index.values.tolist()))

		for pos in df_snp.index.values.tolist():
			#fout.write(samn + "\t")
			rowvalues=df_snp.loc[[pos]]
			#printdisplay(MESSAGE_DEBUG,'rowvalues=')
			#printdisplay(MESSAGE_DEBUG,rowvalues)
			allelestr=''
			allelestrout=""
			# one tab-prefixed allele call per sample; '..'/'nan' become '.'
			for samn in samples:
				#printdisplay(MESSAGE_DEBUG,samn )
				#printdisplay(MESSAGE_DEBUG, str(rowvalues.loc[pos][samn]).replace("..",".").replace("nan","."))
				#allelestrout =  allelestrout + "\t" + str(rowvalues[samn].loc[[pos]].replace("..",".").replace("NaN","."))
				allelestrout =  allelestrout + "\t" + str(rowvalues.loc[pos][samn]).replace("..",".").replace("nan",".")
				#allelestrout.append("\t" + rowvalues[samn].loc[[pos]].replace("..","."))
			#allelestrout="".join(allelestrout)

			#printdisplay(MESSAGE_DEBUG,'allelestrout=' + allelestrout)
			#alleles=set(allelestrout).remove("\t").remove(".")
			# distinct allele characters, ignoring separators and missing calls
			alleles=set(allelestrout)
			if "\t" in alleles:
				alleles.remove("\t")
			if "." in alleles:
				alleles.remove(".")
			# 'property' holds the 'chrom:pos' locus string
			chrpos=str(rowvalues.loc[pos]['property'])
			#printdisplay(MESSAGE_DEBUG,'chrpos')
			#printdisplay(MESSAGE_DEBUG,chrpos)
			# only write polymorphic positions (more than one observed allele)
			if len(alleles)>1:
				chrposarr=chrpos.split(":")
				fout.write( chrpos +  "\t" + "/".join(list(alleles)) +  "\t" + chrposarr[0] + "\t" + chrposarr[1] +"\t+\tNA\tNA\tNA\tNA\tNA\tNA" +  allelestrout + "\n")
			#else:
			#	printdisplay(MESSAGE_DEBUG,"alleles in " + chrpos + " " + str(alleles))


def do_plink(df_snpphen, dflabel,datatype=TYPE_PHEN):
	"""Convert SNP+phenotype data into PLINK binary files ('snp_<dflabel>.*').

	Restricts the input to samples shared by both datatypes, exports a HapMap
	file, converts it to .ped via HapMap_to_PLINK.py and runs plink --make-bed.
	Returns (df_clean_phen, df_clean_snp, df_clean_snpphen), or
	(None, None, None) when no usable .map file was produced.
	"""
	# keep only samples present in both the phenotype and the SNP partitions
	[df_clean_phen, df_clean_snp,df_clean_snpphen]=verify_samples0(df_snpphen, datatype, datatype2=TYPE_SNP, common_only=True)

	plinklabel=dflabel
	printdisplay(MESSAGE_DEBUG,'plinklabel=' + plinklabel)
	to_hapmap0(df_clean_snp, 'snp_' + plinklabel )

	mapfile = "snp_" + plinklabel + ".map"
	if not (os.path.exists(mapfile) and os.path.getsize(mapfile) > 0):
		return (None,None,None)

	# remove stale outputs from earlier runs
	for stale in ("plink.bed", "plink.bim", "plink.fam",
			"snp_" + plinklabel + ".ped", "snp_" + plinklabel + ".bed",
			"snp_" + plinklabel + ".bim", "snp_" + plinklabel + ".fam"):
		ossystem("rm " + stale)

	cmd="python HapMap_to_PLINK.py snp_" + plinklabel + ".hmp snp_" + plinklabel + ".map > snp_" + plinklabel + ".ped"
	printdisplay(MESSAGE_DEBUG,cmd)
	ossystem(cmd)

	cmd= plink_path + " --file " + "snp_" + plinklabel + " --allow-extra-chr --make-bed --out snp_" + plinklabel
	printdisplay(MESSAGE_DEBUG,cmd)
	ossystem(cmd)

	return (df_clean_phen, df_clean_snp,df_clean_snpphen)





def do_plink0(df_snpphen, dflabel,datatype=TYPE_PHEN):
	"""Convert SNP+phenotype data into PLINK binary files ('snp_<dflabel>.*').

	Legacy twin of do_plink(); kept for existing callers. Returns
	(df_clean_phen, df_clean_snp, df_clean_snpphen), or (None, None, None)
	when no usable .map file was produced.

	Fix for consistency with do_plink(): previously this variant lacked the
	empty-.map guard and would run the converter and plink on missing input.
	"""
	# keep only samples present in both the phenotype and the SNP partitions
	[df_clean_phen, df_clean_snp,df_clean_snpphen]=verify_samples0(df_snpphen, datatype, datatype2=TYPE_SNP, common_only=True)

	plinklabel=dflabel
	printdisplay(MESSAGE_DEBUG,'plinklabel=' + plinklabel)
	to_hapmap0(df_clean_snp, 'snp_' + plinklabel )

	# guard (matches do_plink): bail out when to_hapmap0 produced no markers
	if not (os.path.exists("snp_" + plinklabel + ".map" ) and os.path.getsize("snp_" + plinklabel + ".map" ) > 0):
		return (None,None,None)

	# remove stale outputs from earlier runs
	ossystem("rm plink.bed")
	ossystem("rm plink.bim")
	ossystem("rm plink.fam")
	ossystem("rm snp_" + plinklabel + ".ped")
	ossystem("rm snp_" + plinklabel + ".bed")
	ossystem("rm snp_" + plinklabel + ".bim")
	ossystem("rm snp_" + plinklabel + ".fam")

	cmd="python HapMap_to_PLINK.py snp_" + plinklabel + ".hmp snp_" + plinklabel + ".map > snp_" + plinklabel + ".ped"
	printdisplay(MESSAGE_DEBUG,cmd)
	ossystem(cmd)
	cmd= plink_path + " --file " + "snp_" + plinklabel + " --allow-extra-chr --make-bed --out snp_" + plinklabel
	printdisplay(MESSAGE_DEBUG,cmd)
	ossystem(cmd)

	return (df_clean_phen, df_clean_snp,df_clean_snpphen)


import time

def jaccard(set1,set2):
	"""Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

	Two empty sets are treated as identical and yield 1.0
	(the previous version raised ZeroDivisionError in that case).
	"""
	union = set1.union(set2)
	if not union:
		return 1.0
	return len(set1.intersection(set2))*1.0/len(union)


def ped2tsv(pedfile,mapPos2Ref,mapSample2Group):
	# Convert '<pedfile>.ped'/'<pedfile>.map' into a human-readable
	# '<pedfile>.ped.tsv' with position headers, reference alleles and
	# per-sample genotype calls.
	#
	# pedfile: path prefix; both '<pedfile>.map' and '<pedfile>.ped' are read.
	# mapPos2Ref: dict keyed by (chrom, pos) strings (columns 0 and 3 of the
	#	.map file) -> reference allele string.
	# mapSample2Group: dict sample id (column 0 of the .ped file) ->
	#	subpopulation label; samples missing from it get an empty group.
	#
	# NOTE(review): the 'while line:' readline loops stop at the first blank
	# line, so a blank line mid-file truncates the output — presumably files
	# never contain one; confirm before relying on it.
	#SRR10578251     SRR10578251     0       0       0       -9      G G     T T     G A     A A     T T     A A     C C     T T     G G     C C     G T     A A     G G     G G     A G     A A     T T     G T     G T     T T     C A     A G     A A     C C     T T     T T     G G     G G     A A     T C     C C     C C     C G     A T     C C     C C     G G     C C     A A     C C     T T     G G     G G     A A     A A     T T     T T     C C     T T     T T     C C     T T     T T     C C     T T     A T     T T     C C     G G     A A     T C     A T     A G     A C     C C     G A     G G     0 0     C T     T T     C C     T T     C A     A G     T T     G A     A A     T T     C C     A A     T T     G G     A A     0 0     T T     G G     C C     G G     T T     G G
	headers=[]	# 'chrom_pos' column headers, in .map order
	refs=[]		# matching reference alleles from mapPos2Ref
	with open(pedfile + '.map') as fin:
		line=fin.readline().rstrip()
		while line:
			cols=line.split("\t")
			headers.append(cols[0] + '_' + cols[3])
			refs.append(mapPos2Ref[(cols[0],cols[3])])
			line=fin.readline().rstrip()
	
	with open(pedfile + '.ped') as fin, open(pedfile + '.ped.tsv','wt') as fout:
		# two header rows: positions, then reference alleles
		outcols=['CS10 POSITIONS','ASSAY ID','ACCESSION','SUBPOPULATION','MISMATCH'] + headers
		fout.write("\t".join(outcols)+"\n")
		outcols=['CS10 ALLELES','','','',''] + refs
		fout.write("\t".join(outcols)+"\n")
		line=fin.readline().rstrip()
		while line:
			cols=line.split("\t")
			group=''
			if cols[0] in mapSample2Group:
				group=mapSample2Group[cols[0]]
			outcols=[cols[0],cols[0],'',group,'']
			# genotype fields start at column 6 of the .ped line
			for c in range(6,len(cols)):
				al12=cols[c]
				# heterozygous 'A G' -> 'A/G'; homozygous 'A A' -> 'A'
				if al12[0]!=al12[2]:
					al12=al12.replace(" ","/")
				else: 
					al12=al12[0]
				outcols.append(al12)
			fout.write("\t".join(outcols)+"\n")
			line=fin.readline().rstrip()
	printdisplay(MESSAGE_INFO,pedfile + '.ped.tsv  generated')


def do_phylo_fromplink0(df_snpphen, dflabel,plink_file=None,cluster_group=None, convert_group=None,K=3,plink_extract=None,heatmap=False,drawtree=False,fastapath=None): #,unit,phenotypes=None,assocmethod='qassoc.counts',maxp=None,topn=None,plink_file=None,annot=None,df_genes=None,covar=None,plotgwas=True, plotgwasi=False,pfilter=None):
	"""Cluster samples from a plink bfile and compare the clustering with
	a known grouping.

	Parameters:
		df_snpphen: DataFrame with one column per sample plus 'datatype'
			and 'property' columns; the row where property == cluster_group
			holds each sample's known group label.
		dflabel: prefix used for all intermediate/output files.
		plink_file: existing plink bfile prefix; defaults to "snp_"+dflabel.
		cluster_group: property-row name holding the known group labels.
		convert_group: optional dict mapping raw group labels to merged
			labels (e.g. 'BLDT' -> 'Drug') before comparison.
		K: number of clusters passed to geno_heatmap_cluster.R.
		plink_extract: optional list of 4-tuples (chr, start, end, label)
			written to a plink "--extract range" file.
		heatmap: with fastapath set, also builds an annotated genotype TSV
			(via ped2tsv) and a matrix/tree image.
		drawtree: render the newick tree image via draw-tree-3.py.
		fastapath: reference fasta used by bedtools getfasta to fetch each
			SNP's reference allele.

	Returns (vi, nvi, mapGroupClust2Jaccard, mapGroup2Sample,
	mapCluster2Sample, chistat, chipval): variation-of-information and
	chi-square agreement statistics plus group/cluster memberships.
	"""
	plinklabel=dflabel
	
	if plink_file:
		plinkpref=plink_file
	else:
		plinkpref="snp_" + plinklabel

	#convert_group={'BLDT':'Drug','NLDT':'Drug',',unknown':'unknown','Hemp-type':'Hemp','Drug-type':'Drug','Drug type feral':'Drug','Type I':'Drug','Type III':'Hemp'}
	printdisplay(MESSAGE_DEBUG,"do_phylo_fromplink0 using " + plinklabel)

	# sample columns are everything except the two metadata columns
	samples=df_snpphen.columns.values.tolist()
	samples.remove('datatype')
	samples.remove('property')
	mapSample2Group=dict()
	mapSample2ConvertGroup=dict()
	mapGroup2Sample=dict()
	printdisplay(MESSAGE_DEBUG,'samples=' + str(len(samples)))

	

	# build known-group membership maps and write the plink --keep file
	# (family id 0, one line per sample)
	with open(plinklabel + '_keep.txt','w') as fout:
		for s in samples:
			#print(s)
			#print(df_snpphen.loc[ df_snpphen['property']=='cultivar_group','property'])
			#print(df_snpphen.loc[df_snpphen['property']=='cultivar_group',[s]]  )
			#print(df_snpphen.loc[df_snpphen['property']=='cultivar_group',[s]].values[0][0]  )
			group=df_snpphen.loc[df_snpphen['property']==cluster_group,[s]].values[0][0] 
			mapSample2Group[ s ]=group

			# mapSample2Group keeps the raw label; the merged label is used
			# for mapGroup2Sample / mapSample2ConvertGroup
			if convert_group and group in convert_group:
				group=convert_group[group]
			if not group in mapGroup2Sample:
				print('group ' + group)
				mapGroup2Sample[group]=set()

			#mapSample2Group[ s ]=group
			mapGroup2Sample[group].add(s)
			mapSample2ConvertGroup[s]=group
			#mapGroup2Sample[]
			fout.write("0 "+ s + "\n")


	#display(mapSample2Group)
	#display(mapGroup2Sample)

	extractstr=''
	if plink_extract:
		# write a plink "--extract range" file: chr start end label
		with open(plinklabel + '_extract.txt','w') as fout:
			for i in plink_extract:
				fout.write(i[0] + " " + i[1] + " "  + i[2] + " "  + i[3] + "\n" )
		extractstr=' --extract range ' + plinklabel + '_extract.txt '

	mapCluster2Sample=dict()
	if False:
		# disabled alternative: plink's own group-avg IBS clustering
		cmd=  plink_path + " --bfile " + plinkpref  + " --cluster group-avg --distance ibs --allow-extra-chr --out " + plinklabel + "  --keep " + plinklabel + "_keep.txt --K " + str(K) + extractstr
		#printdisplay(MESSAGE_DEBUG,cmd)
		ossystem(cmd)
		#osmv('plink.cluster2 ' + plinklabel + '.cluster2')

		with open(plinklabel + '.cluster2') as fin:
			line=fin.readline().rstrip()
			while line:
				cols=line.split()
				clus=cols[2]
				if not clus in mapCluster2Sample:
					mapCluster2Sample[clus]=set()
				mapCluster2Sample[clus].add(cols[1])
				line=fin.readline().rstrip()
	else:
		# active path: subset the bfile, then cluster with the R script,
		# which writes <label>.groups.txt (sample, cluster) and a newick tree
		if not os.path.exists(plinklabel + '.bim'):
			cmd=plink_path + ' --bfile ' + plinkpref + ' --allow-extra-chr --out ' + plinklabel  + ' --keep ' + plinklabel + '_keep.txt --make-bed ' +  extractstr
			ossystem(cmd)
		if os.path.exists(plinklabel + '.bim'):
			ossystem('Rscript geno_heatmap_cluster.R ' + plinklabel + ' ' + str(K))
			with open(plinklabel + '.groups.txt') as fin:
				# first readline skips the header row
				line=fin.readline().rstrip()
				line=fin.readline().rstrip()
				while line:
					cols=line.split()
					clus=cols[1]
					if not clus in mapCluster2Sample:
						mapCluster2Sample[clus]=set()
					# sample names come quoted from R; strip the quotes
					mapCluster2Sample[clus].add(cols[0].replace("\"",""))
					line=fin.readline().rstrip()
		else:
			printdisplay(MESSAGE_INFO,'No variant for ' + plinklabel)

	#display(mapCluster2Sample)


	
	# pairwise Jaccard between every known group and every computed cluster,
	# keyed (group, cluster) -> (jaccard, |group|, |cluster|)
	mapGroupClust2Jaccard=dict()
	for g in mapGroup2Sample:
		if True: #convert_group and g in convert_group: #['Hemp','Drug','Type II','Bassal cannabis']:
			for c in mapCluster2Sample:
				mapGroupClust2Jaccard[(g,c)]=(jaccard( mapGroup2Sample[g],  mapCluster2Sample[c]), len(mapGroup2Sample[g]),len(mapCluster2Sample[c]))
	#print('hello')
	#display('Jaccard')
	#display(mapGroupClust2Jaccard)

	# overall agreement between the two partitions
	(vi,nvi)=variation_of_information(list( mapGroup2Sample.values()), list(mapCluster2Sample.values() ))
	printdisplay(MESSAGE_INFO, 'Variation of Information=' + str(vi) + '  Normalized VI=' + str(nvi))

	#(chistat,chipval)=chisquare(list( mapGroup2Sample.values()), list(mapCluster2Sample.values() ) )
	#printdisplay(MESSAGE_INFO, 'Chisquare statistics=' + str(chistat) + '  pvalue=' + str(chipval))
	(chistat,chipval)=chisquare_contingency0(mapGroup2Sample,mapCluster2Sample)
	printdisplay(MESSAGE_INFO, 'Chisquare statistics=' + str(chistat) + '  pvalue=' + str(chipval))

	if drawtree:
		
		# sample -> merged-group table consumed by the tree-drawing script
		with open(plinklabel+'.sample2group.txt','wt') as fout:

			for s in mapSample2ConvertGroup:
				fout.write(s + "\t" + mapSample2ConvertGroup[s]+"\n")
		cmd='python draw-tree-3.py ' +  plinklabel +'.newick'
		print(cmd)
		ossystem(cmd)
		#pltshow(MESSAGE_INFO,plinklabel + '.tree.bytype.png')
		#pltshow(MESSAGE_INFO,plinklabel + '.tree.bysource.png')
		pltshow(MESSAGE_INFO,plinklabel + '.newick.tree.png')


	bedpath='bedtools'
	#heatmap=True
	mapPos2Ref=dict()
	if heatmap and fastapath:
		# get reference
		# write a 0-based BED of SNP positions, fetch the reference base for
		# each with bedtools getfasta
		snppos=[]
		with open(plinklabel+'.bim') as fin, open(plinklabel+'.btbed','wt') as fout:
			line=fin.readline()
			while line:
				cols=line.split()
				fout.write(cols[0] +"\t"+ str(int(cols[3])-1) + "\t" + cols[3] + "\n")
				snppos.append((cols[0],cols[3]))
				line=fin.readline()
	
		ossystem(bedpath + ' getfasta -fi ' + fastapath + ' -bed ' + plinklabel+'.btbed > ' +  plinklabel+'.bed.ref')

		# fasta alternates ">header" / sequence lines; sequence lines map to
		# snppos in order.  NOTE(review): the first readline() is not
		# strip()ed — works because the first line is a ">" header; confirm
		# bedtools output always starts with a header line.
		with open(plinklabel+'.bed.ref') as fin:
			line=fin.readline()
			i=0
			while line:
				if not line.startswith(">"):
					mapPos2Ref[snppos[i]]=line.upper()
					i+=1
				line=fin.readline().strip()

		#display('mapPos2Ref=' + str(mapPos2Ref))

		# tab-delimited .ped export, then convert to the annotated TSV
		cmd=plink_path + ' --bfile ' + plinklabel + ' --recode tab --out ' + plinklabel + '_ped --allow-extra-chr'
		print(cmd)
		ossystem(cmd)
		#SRR10578251     SRR10578251     0       0       0       -9      G G     T T     G A     A A     T T     A A     C C     T T     G G     C C     G T     A A     G G     G G     A G     A A     T T     G T     G T     T T     C A     A G     A A     C C     T T     T T     G G     G G     A A     T C     C C     C C     C G     A T     C C     C C     G G     C C     A A     C C     T T     G G     G G     A A     A A     T T     T T     C C     T T     T T     C C     T T     T T     C C     T T     A T     T T     C C     G G     A A     T C     A T     A G     A C     C C     G A     G G     0 0     C T     T T     C C     T T     C A     A G     T T     G A     A A     T T     C C     A A     T T     G G     A A     0 0     T T     G G     C C     G G     T T     G G
		ped2tsv(plinklabel + '_ped',mapPos2Ref,mapSample2Group)

		cmd='python draw_tree_matrix.py ' + plinklabel + '_ped.ped.tsv ' + plinklabel +'.newick'
		print(cmd)
		ossystem(cmd)
		pltshow(MESSAGE_INFO,'type_snpseektree_'+ dflabel + '.newick.rename.nwk.rr.png')



	return (vi, nvi, mapGroupClust2Jaccard, mapGroup2Sample,  mapCluster2Sample,chistat,chipval)



	# NOTE(review): everything below is unreachable (after the return above)
	# and additionally guarded by "if False"; kept as-is.
	if False: #not os.path.exists(plinklabel + '.dist'):
	
		#	--distance [{square | square0 | triangle}] [{gz | bin | bin4}] ['ibs'] ['1-ibs'] ['allele-ct'] ['flat-missing']
		#cmd=  plink_path + " --bfile " + plinkpref  + " --cluster --distance square0 --allow-extra-chr"
		cmd=  plink_path + " --bfile " + plinkpref  + " --cluster --distance square0 --allow-extra-chr"
		printdisplay(MESSAGE_DEBUG,cmd)
		ossystem(cmd)
		osmv('plink.dist ' + plinklabel + '.dist')


		pd_dist=pd.read_csv(plinklabel + '.dist',sep="\t",header=None)
		display(pd_dist)
		#dm=pd_dist.to_numpy()
		dm=[]
		ri=0
		for r in pd_dist.values.tolist():
			col=[]
			for j in range(ri+1):
				col.append(r[j])
			dm.append(col)
			ri+=1


		# Importing necessary libraries from BioPython
		from Bio import Phylo, AlignIO
		from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor, DistanceMatrix

		#display(df_snpphen)
		distMatrix= DistanceMatrix(samples, matrix=dm)
		
		# NumPy 2D array
		# Construct the phlyogenetic tree using UPGMA algorithm
		constructor = DistanceTreeConstructor()
		UPGMATree = constructor.upgma(distMatrix)# Construct the phlyogenetic tree using NJ algorithm
		Phylo.draw(UPGMATree)
		#Phylo.write([tree1, tree2], "example-both.xml", "phyloxml")
		Phylo.write(UPGMATree, dflabel + '.upgma.newick', "newick")
		NJTree = constructor.nj(distMatrix)
		#Phylo.draw(NJTree)
		Phylo.write(NJTree, dflabel + '.nj.newick', "newick")


def do_clustering0(df_snp,dflabel,group,convert_group=None,K=3,heatmap=False,plink_file=None,plink_extract=None,recalc=False,rerunplink=False,fastapath=None):
	"""Cluster the samples whose `group` property carries a usable label.

	Keeps only the columns of df_snp whose value in the property row
	`group` is either a metadata marker (TYPE_PROP), the group name
	itself, or one of the raw labels in convert_group, then delegates to
	do_phylo0 with prefix 'cluster_' + dflabel.

	FIX: convert_group/plink_extract previously used mutable default
	arguments (dict()/[]); replaced with None sentinels — behavior for
	all callers is unchanged.
	"""
	if convert_group is None:
		convert_group=dict()
	if plink_extract is None:
		plink_extract=[]
	df_hasgroup=df_snp.set_index('property',drop=False)
	allowed=[TYPE_PROP,group] + list(convert_group.keys())
	df_hasgroup=df_hasgroup.loc[:, df_hasgroup.loc[group].isin(allowed)]
	df_hasgroup=df_hasgroup.reset_index(drop=True)
	return do_phylo0(df_hasgroup,'cluster_' + dflabel,plink_file=plink_file,recalc=recalc,rerunplink=rerunplink,K=K,cluster_group=group,convert_group=convert_group,plink_extract=plink_extract,heatmap=heatmap,fastapath=fastapath)


def do_gwas_fromplink0(df_snpphen, dflabel,unit,phenotypes=None,assocmethod='qassoc.counts',maxp=None,topn=None,plink_file=None,annot=None,df_genes=None,covar=None,plotgwas=True, plotgwasi=False,pfilter=None):
	"""Run plink association tests per phenotype and collect top hits.

	For every phenotype in df_snpphen (optionally restricted to
	`phenotypes`) a plink run is launched in the background using
	`assocmethod` ('qassoc.counts', 'linear' or 'logistic'; covariates
	only apply to the latter two), the runs are awaited (up to 72h),
	manhattan plots are drawn with qqman, and results passing `maxp`
	(optionally capped at `topn`) are merged into
	<dflabel>.gwas.top.tsv — annotated with overlapping genes when
	`annot` is given.  Existing output files act as a cache and skip the
	corresponding stage.

	Returns the DataFrame of top associations (trait/snps/pvalue...).

	Fixes vs. previous revision:
	- single-plot branch built the assoc filename with the module global
	  keep_unit instead of the `unit` argument;
	- `alldone` was hard-coded True so the timeout branch never fired;
	- '--covar-name' list was joined with ", " which splits the shell
	  argument (plink expects comma-separated names without spaces);
	- bare exit() replaced by sys.exit(1).
	"""
	plinklabel=dflabel

	printdisplay(MESSAGE_DEBUG,"do_gwas_fromplink0 using " + plinklabel)

	phens=get_phenotypes0(df_snpphen)

	if phenotypes:
		phens=list(set(phens).intersection(set(phenotypes)))

	pfilterstr=''
	if pfilter:
		pfilterstr=' --pfilter ' + str(pfilter)
	allpheno=''
	phenmethod='.' + assocmethod
	outpfile='.qassoc'
	covarstr=''
	covfile='cov_' + unit + '_' + plinklabel + '.tsv'

	if phenmethod=='.logistic':
		outpfile='.assoc.linear'
		if covar:
			# FIX: no space after the comma — a space would split the
			# shell argument and break --covar-name
			covarstr=' --covar-name ' + ",".join(covar) + " --covar " + covfile
	if phenmethod=='.linear':
		outpfile='.assoc.linear'
		if covar:
			covarstr=' --covar-name ' + ",".join(covar) + " --covar " +  covfile
	if phenmethod=='.qassoc.counts':
		outpfile='.qassoc'
		if covar:
			printdisplay(MESSAGE_INFO,'covar only works with linear and logistic methods')

	if phenmethod=='.logistic':
		allpheno=" --all-pheno "

	if allpheno:
		# --all-pheno prefixes output files with the phenotype index
		outpfile='.P1' + outpfile

	qqman_plots=[]
	assocpoutfiles=[]
	osrm("*" + outpfile + '.done')

	for p in phens:
		prep=p.replace(" ","_")
		qqman_plots.append(prep)

	# skip the whole plink/plot stage if the combined image already exists
	if not os.path.exists(dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png'):
		qqman_plots=[]
		if covar:
			covariate_file(df_snpphen,  covfile ,covar=covar)
		for p in phens:

			prep=p.replace(" ","_")

			printdisplay(MESSAGE_DEBUG,'GWAS ' + prep)
			printdisplay(MESSAGE_DEBUG,'keep_unit=' +str(unit))
			printdisplay(MESSAGE_DEBUG,'plinklabel=' +str(plinklabel))
			printdisplay(MESSAGE_DEBUG,'prep=' +str(prep))

			phenfile='phen_' + prep + '_' + unit + '_' + plinklabel 
			phenotype_file(df_snpphen,  p, phenfile )

			assocfile="assoc_" + prep + "_" + unit  + '_' + plinklabel + phenmethod # ".qassoc"
			try:
				osrm( assocfile + '*')
			except Exception:
				# best-effort cleanup of stale outputs
				pass

			plinkpref="snp_" + plinklabel
			if plink_file:
				plinkpref=plink_file

			cmd=""

			if not os.path.exists(assocfile + outpfile + '.done'):

				osrm(assocfile + outpfile + '.done')
				# each run is launched in the background ("&"); the .done
				# marker file signals completion to the wait loop below
				if phenmethod=='.qassoc.counts':
					cmd=  plink_path + " --bfile " + plinkpref + " " + allpheno + " --assoc counts --pheno  " + phenfile  +  ".txt --out " + assocfile + pfilterstr + " --allow-extra-chr --allow-no-sex && touch  " + assocfile + outpfile + '.done &> ' + assocfile + outpfile + '.out  &'
				if phenmethod=='.logistic':
					cmd=  plink_path + " --bfile  " + plinkpref + " " + allpheno + " --logistic " + covarstr + " --pheno  " + phenfile  + ".txt --out " + assocfile  + pfilterstr  + " --allow-extra-chr --allow-no-sex && touch  " + assocfile + outpfile + '.done &> ' + assocfile + outpfile + '.out  &'
				if phenmethod=='.linear':
					cmd=  plink_path + " --bfile  " + plinkpref + " " + allpheno + " --linear  " + covarstr + " --pheno  " + phenfile + ".txt --out " + assocfile  + pfilterstr  + " --allow-extra-chr --allow-no-sex && touch  " + assocfile + outpfile + '.done &> ' + assocfile + outpfile + '.out  &'

				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)

			assocpoutfiles.append((assocfile + outpfile, prep))

		# wait all jobs to finish (poll .done markers every 30s, up to 72h)
		maxmins=60*72
		mincnt=0
		donecnt=0

		printdisplay(MESSAGE_INFO,"waiting for gwas runs")
		while mincnt<maxmins:
			donecnt=0
			for assocfile in assocpoutfiles:
				if os.path.exists(assocfile[0] + '.done'):
					donecnt+=1

			printdisplay(MESSAGE_DEBUG,str(mincnt) + ' mins. ' + str(donecnt) + '/' + str(len(assocpoutfiles)) + " done.")
			if donecnt==len(assocpoutfiles):
				break

			mincnt+=0.5
			time.sleep(30)

		# FIX: previously alldone was hard-coded True, making the timeout
		# branch below unreachable
		alldone=(donecnt==len(assocpoutfiles))

		if alldone:
			printdisplay(MESSAGE_INFO,"All gwas plink done")
		else:
			printdisplay(MESSAGE_INFO,"Not all gwas plink done")
			sys.exit(1)

		for assocpoutfile in assocpoutfiles:
			try:
				if os.path.exists(assocpoutfile[0]):
					if plotgwasi:
						# individual per-trait manhattan plot
						qqman.manhattan(assocpoutfile[0], out=assocpoutfile[0] + '.png',genomewideline = -math.log10(maxp))
					qqman_plots.append(assocpoutfile[1])
				else:
					printdisplay(MESSAGE_INFO,assocpoutfile[0] + ' not found')
			except Exception as e:
				printdisplay(MESSAGE_DEBUG,e)

		printdisplay(MESSAGE_INFO,'qqman_plots=' + str(qqman_plots))

		pcount=0
		ncols=1
		if plotgwas:
			if len(qqman_plots)>1:
				# one stacked manhattan panel per trait
				fig_mp, axes_mp = plt.subplots(nrows=len(qqman_plots), ncols=ncols, figsize = (20, len(qqman_plots)*2))
				for prep in  sorted(qqman_plots):
					assocfile="assoc_" + prep + "_" + unit  + '_' + plinklabel + phenmethod
					if os.path.exists(assocfile + outpfile):
						qqman.manhattan(assocfile + outpfile, ax=axes_mp[pcount],title=prep,cmap=plt.get_cmap("jet"),cmap_var=10, genomewideline = -math.log10(maxp)) # , suggestiveline=False, genomewideline=False)
						pcount+=1
					else:
						printdisplay(MESSAGE_INFO,'No file ' + assocfile + outpfile)

				fig_mp.tight_layout()
				plt.savefig(dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png', format="png")
				plt.clf()
				plt.close()
				printdisplay(MESSAGE_INFO,'image generated ' + dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png')

			elif len(qqman_plots)==1:
				# FIX: filename previously used the module global keep_unit;
				# every other path builds it from the `unit` argument
				assocfile="assoc_" + qqman_plots[0] + "_" + unit  + '_' + plinklabel + phenmethod
				qqman.manhattan(assocfile + outpfile, out=dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png', genomewideline = -math.log10(maxp))
				plt.savefig(dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png', format="png")
				plt.clf()
				plt.close()
				printdisplay(MESSAGE_INFO,'image generated ' + dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png')
			else:
				printdisplay(MESSAGE_INFO,'image Not generated ' + dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png')
	
	if plotgwas:
		pltshow(MESSAGE_INFO,dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.png')

	if not os.path.exists(dflabel+'.gwas.top.tsv'):

		# merge all per-trait assoc outputs passing maxp into one TSV;
		# SNP ids are rewritten as contig:zero-padded-position
		with open( dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.tsv', 'wt' ) as fout:
			fout.write("\t".join(['PHEN','CHR','SNP','BP','NMISS','BETA','SE','R2','T','P']) + "\n")
			for prep in  sorted(qqman_plots):
				assocpoutfile="assoc_" + prep + "_" + unit  + '_' + plinklabel + phenmethod  + outpfile
				if os.path.exists(assocpoutfile):
					with open(assocpoutfile ) as fin:
						fin.readline()
						line=fin.readline().strip()
						while line:
							cols=line.split()
							if maxp and  not (cols[8]=='NA') and float(cols[8])<=maxp:
								cols[1]=cols[0] + ":" + cols[2].zfill(9)
								fout.write("\t".join( [prep] + cols ) + "\n" )
							line=fin.readline().strip()
					printdisplay(MESSAGE_DEBUG,'Added file ' + assocpoutfile)
				else:
					printdisplay(MESSAGE_INFO,'No file ' + assocpoutfile)

		df_allassoc=pd.read_csv( dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod + '.tsv', index_col=None,sep="\t")
		if df_allassoc.shape[0]>0:
			if maxp:
				df_allassoc=df_allassoc[df_allassoc['P']<=maxp]
			df_allassoc=df_allassoc.sort_values(['P','SNP','PHEN'])
			if topn:
				df_allassoc=df_allassoc.head(topn)

			if annot:

				df_allassoc_orig=df_allassoc.copy()
				snplist=sorted(list(set(df_allassoc['SNP'].tolist())))

				if len(snplist)>0:

					dfgenes=get_genes_by_position0(snplist,annot=annot)

					# add gene annotations to the surviving SNPs
					printdisplay(MESSAGE_DEBUG,'df_allassoc= ' + str(df_allassoc.columns.values.tolist()))
					printdisplay(MESSAGE_DEBUG,'dfgenes= ' + str(dfgenes.columns.values.tolist()))
					df_allassoc=do_snps_getgenes(df_allassoc,dfgenes,  outfile=dflabel + '.gwas.top.snpgene.tsv',snpcolumn='SNP', other_columns=['PHEN','P'])

					printdisplay(MESSAGE_DEBUG,'df_allassoc= ' + str(df_allassoc.columns.values.tolist()))
					df_allassoc=df_allassoc[["trait","snps","pvalue",'snpgenes','note','goterm','iprterm']]
				else:
					printdisplay(MESSAGE_INFO,'No SNPs pass the secondary filter maxp='+str(maxp) + ' topn=' + str(topn) + ' for ' + dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod)
					minp=min(df_allassoc_orig['P'].tolist())
					printdisplay(MESSAGE_INFO,"Minimum p available is " + str(minp))
		else:
			printdisplay(MESSAGE_INFO,'No SNPs pass the primary filter (plink run cutoff) for ' + dflabel + '_gwasmp_' + unit + '_' + plinklabel + phenmethod)

		# no-op when the annot path already renamed the columns
		df_allassoc=df_allassoc.rename(columns={"PHEN":"trait","SNP":"snps","P":"pvalue"})
		df_allassoc.to_csv(dflabel+'.gwas.top.tsv',sep="\t",index=False)
	else:
		df_allassoc=pd.read_csv(dflabel+'.gwas.top.tsv',sep="\t")

	if os.path.exists(dflabel + '.gwas.top.snpgene.tsv'):
		printdisplay(MESSAGE_INFO, dflabel+'.gwas.top.tsv','top snp-trait ' + dflabel+'.gwas.top.snpgene.tsv')
	else:
		printdisplay(MESSAGE_INFO, dflabel+'.gwas.top.tsv','top snp-trait ' + dflabel+'.gwas.top.tsv')

	return df_allassoc

#CHR                     SNP         BP    NMISS       BETA         SE         R2        T            P
#NC_044370.1   NC_044370.1:014625075   14625075       12    0.01667     0.4962  0.0001128  0.03359       0.9739
#NC_044370.1   NC_044370.1:014633248   14633248        9      -0.15     0.7092    0.00635  -0.2115       0.8385



#name    contig  fmin    fmax    strand  gb_keyword      go_term interpro_value
#nad1    NC_029855.1     11633   12020   -1

def clean_df_gene(df):
	"""Pair each XM transcript row with its LOC gene row at the same locus.

	Rows whose name starts with 'XM' are left-joined to rows whose name
	starts with 'LOC' on (contig, fmin, fmax, strand); the gb_keyword,
	go_term and interpro_value annotations of both sides are concatenated.
	Returns columns gene, contig, fmin, fmax, strand, locus, gb_keyword,
	go_term, interpro_value.
	"""
	xm_rows=df[df['name'].str.startswith('XM')]
	loc_rows=df[df['name'].str.startswith('LOC')]
	paired=pd.merge(xm_rows,loc_rows, on=['contig','fmin','fmax','strand'],how='left')
	paired=paired.fillna('')

	# merge both sides' annotations into one whitespace-trimmed field
	for field in ('gb_keyword','go_term','interpro_value'):
		paired[field]=(paired[field + '_x'] + ' ' + paired[field + '_y']).str.strip()

	keep=['name_x','contig','fmin','fmax','strand', 'name_y','gb_keyword', 'go_term', 'interpro_value']
	return paired[keep].rename(columns={'name_x':'gene','name_y':'locus'})



def do_snps_getgenes(dfsnp, dfgenes,outfile,snpcolumn='SNP',snpset=None,other_columns=None,requery=False,how='inner'):
	"""Join SNPs ("contig:pos" ids in `snpcolumn`) to the genes in
	`dfgenes` whose [fmin, fmax] span contains them.

	The join runs through an in-memory sqlite database (`how` selects an
	inner or left join).  The result keeps other_columns + the snp column
	+ gene annotation columns and renames them to
	trait/snps/pvalue/snpgenes/note/goterm/iprterm where applicable.
	When `outfile` is given it acts as a cache: an existing file is
	loaded instead of recomputing unless requery=True.

	NOTE: `snpset` is accepted for interface compatibility but unused.
	NOTE(review): the contig/pos split below adds columns to the caller's
	dfsnp in place — confirm no caller depends on the frame staying
	untouched.

	Fixes vs. previous revision: the sqlite connection is now always
	closed (was leaked), and the mutable default other_columns=[] was
	replaced with a None sentinel.
	"""
	if other_columns is None:
		other_columns=[]

	if requery or outfile is None or not os.path.exists(outfile):

		dfgenes=clean_df_gene(dfgenes)

		# split "contig:pos" ids into the join keys
		dfsnp[['contig','pos']]=dfsnp[snpcolumn].str.split(":",expand=True)
		dfsnp = dfsnp.astype({'pos': 'int'})
		dfgenes = dfgenes.astype({'fmin': 'int', 'fmax': 'int'})

		printdisplay(MESSAGE_DEBUG,'getting genes for snps ' + str(dfsnp.shape) + '  ' + str(dfsnp.columns.values.tolist()))
		printdisplay(MESSAGE_DEBUG,'getting gene data ' + str(dfgenes.shape) + '  ' + str(dfgenes.columns.values.tolist()))

		conn = sqlite3.connect(':memory:')
		try:
			connCursor = conn.cursor()
			dfsnp.to_sql('snp', conn, index=False)
			dfgenes.to_sql('gene', conn, index=False)

			connCursor.execute('create index idx_snp_contig on snp(contig)')
			connCursor.execute('create index idx_snp_pos on snp(pos)')
			connCursor.execute('create index idx_gene_contig on gene(contig)')
			connCursor.execute('create index idx_gene_fminfmax on gene(fmin,fmax)')

			df_snpgne=None
			if how=='inner':
				df_snpgne = pd.read_sql_query("select * from snp as s, gene as g where g.contig=s.contig and g.fmin<=s.pos and g.fmax>=s.pos", conn)
			elif how=='left':
				df_snpgne = pd.read_sql_query("select * from snp as s left join gene as g on g.contig=s.contig and g.fmin<=s.pos and g.fmax>=s.pos", conn)
		finally:
			# FIX: the in-memory connection was previously never closed
			conn.close()

		printdisplay(MESSAGE_DEBUG,'df_snpgne ' + str(df_snpgne.shape) + ' ' + str(df_snpgne.columns.values.tolist()))

		df_snpgne=df_snpgne[ other_columns + [snpcolumn] + ['gene','locus','gb_keyword','go_term', 'interpro_value']]
		df_snpgne=df_snpgne.rename(columns={'PHEN':'trait',snpcolumn:'snps','P':'pvalue','gene':'snpgenes','gb_keyword':'note','go_term':'goterm', 'interpro_value':'iprterm'})

		dfsnp=df_snpgne
		if outfile:
			df_snpgne.to_csv(outfile,sep="\t",index=False)
			printdisplay(MESSAGE_DEBUG,'generated ' + outfile)
	else:
		if outfile:
			dfsnp=pd.read_csv(outfile,sep="\t")
			printdisplay(MESSAGE_DEBUG,'loaded ' + outfile)
	return dfsnp




def do_gwas0(df_snpphen, dflabel, unit,phenotypes=None,assocmethod='qassoc.counts',rerunplink=True,maxp=1e-5,plink_file=None,annot=None,recalc=False,df_genes=None,covar=None,plotgwas=True, plotgwasi=False,pfilter=None):
	"""Prepare plink input for df_snpphen (rebuilding the bed/bim/fam set
	when forced or incomplete) and then run do_gwas_fromplink0 with the
	same settings.  With recalc=True all cached project files for
	dflabel are removed first.  Returns do_gwas_fromplink0's DataFrame
	of top associations.
	"""
	if recalc:
		clean_project(dflabel)

	plinklabel=dflabel
	df_clean_snpphen=df_snpphen

	if plink_file is None:
		# rebuild only when forced or when any of bed/fam/bim is missing
		bed_ready=all(os.path.exists("snp_" + plinklabel + ext) for ext in (".bed", ".fam", ".bim"))
		if rerunplink or not bed_ready:
			(dummy, dummy,df_clean_snpphen)=do_plink(df_snpphen, dflabel)
	else:
		# restrict to samples present in the supplied plink file
		df_clean_snpphen=get_plink_common_samples(df_snpphen,TYPE_PHEN,plink_file)

	return do_gwas_fromplink0(df_clean_snpphen, dflabel,unit=unit,phenotypes=phenotypes,assocmethod=assocmethod,maxp=maxp,plink_file=plink_file,annot=annot,df_genes=df_genes,covar=covar,plotgwas=plotgwas, plotgwasi=plotgwasi,pfilter=pfilter)


def do_phylo0(df_snpphen, dflabel,plink_file=None,recalc=False,rerunplink=False,K=None,cluster_group=None,convert_group=None,plink_extract=None,heatmap=False,drawtree=False,fastapath=None):
	"""Prepare plink input (rebuilding bed/bim/fam if forced or missing)
	and run do_phylo_fromplink0 to cluster samples and compare the
	clusters with the `cluster_group` labels.  With recalc=True all
	cached project files are removed first.  Returns
	do_phylo_fromplink0's result tuple, or None when do_plink yields no
	SNP rows or no property (TYPE_PROP) rows.
	"""
	if recalc:
		clean_project(dflabel)

	plinklabel=dflabel

	df_clean_snpphen=df_snpphen
	if plink_file is None:
		if rerunplink or not (os.path.exists("snp_" + plinklabel + ".bed") and  os.path.exists("snp_" + plinklabel + ".fam")  and os.path.exists("snp_" + plinklabel + ".bim")):
			(dummy1, dummy2,df_clean_snpphen)=do_plink(df_snpphen, dflabel,datatype=TYPE_PROP)
			if df_clean_snpphen is None:
				# FIX: these messages were passed to IPython's display();
				# printdisplay is the logging helper used everywhere else
				printdisplay(MESSAGE_INFO,'No SNP entry')
				return
			if dummy1.shape[0]==0:
				printdisplay(MESSAGE_INFO,'No ' + TYPE_PROP + ' entry')
				return
		plink_file="snp_" + plinklabel 
	else:
		# restrict to samples present in the supplied plink file
		df_clean_snpphen=get_plink_common_samples(df_snpphen,TYPE_PROP,plink_file)
		printdisplay(MESSAGE_DEBUG,df_clean_snpphen,'df_clean_snpphen')

	return do_phylo_fromplink0(df_clean_snpphen, dflabel,plink_file=plink_file,cluster_group=cluster_group,convert_group=convert_group,K=K,plink_extract=plink_extract,heatmap=heatmap,drawtree=drawtree,fastapath=fastapath)




def do_recode_plink(df_snpphen, dflabel,recode='AD',recodeplink=False,transpose='transpose',from_plink=None,datatype=TYPE_SNP):
	"""Recode a plink bfile into the format given by `recode` (e.g. 'AD'),
	writing snp_<dflabel>_recode<RECODE>.*.

	Uses `from_plink` as the source bfile when given; otherwise uses (and
	rebuilds, when forced or incomplete) snp_<dflabel>.
	NOTE(review): the `transpose` parameter is currently unused — confirm
	with callers before removing.
	"""
	out_pref="snp_" + dflabel + "_recode" + recode.replace("-","")

	if from_plink:
		src=from_plink
	else:
		have_all=all(os.path.exists("snp_" + dflabel + ext) for ext in (".bed", ".fam", ".bim"))
		if recodeplink or not have_all:
			do_plink(df_snpphen, dflabel,datatype=datatype)
		src="snp_" + dflabel

	cmd=plink_path + " --bfile " + src + " --recode " + recode + " --out " + out_pref + ' --allow-extra-chr'
	printdisplay(MESSAGE_DEBUG,cmd)
	ossystem(cmd)


#def do_matrixeqtl_getgenes(infile,dflabel,suf="",maxp=0.00001,genes_topn=100,snp_topn=10000,annot='all'):
#def do_matrixeqtl_getgenes(infile,dflabel,suf="",maxp=0.00001,genes_topn=None,snp_topn=None,topn=None,annot='all'):
def do_matrixeqtl_getgenes(infile,dflabel,maxp=None,genes_topn=None,snp_topn=None,topn=None,suf="",annot='all'):


	df_eqtl=pd.read_csv(infile, sep=" ")

	if maxp:
		df_eqtl=df_eqtl[df_eqtl['pvalue']<=maxp]
	df_eqtl=df_eqtl.sort_values(['pvalue'])

	if topn:
		df_eqtl=df_eqtl.head(topn)

	pospass=set()
	genepass=set()
	icnt=0
	passindex=[]
	#"snps" "gene" "statistic" "pvalue" "FDR" "beta"
	#"1" "NC_044371.1:090029695" "XM_030650687.1" 10.2867347358058 5.97978559660231e-15 7.67825900821662e-05 147.211909090909

	issnpposallele=False
	issnppos=False
	firstrow=True
	for index, row in df_eqtl.iterrows():
		if firstrow:
			#try first
			if len(row['snps'].split(":")[1].split("_"))>1:
				issnpposallele=True
			else:
				issnppos=True
			firstrow=False

		if issnpposallele:
			snppos=row['snps'].split("_")[:-1]
			ctgpos="_".join(snppos).split(":")
			#printdisplay(MESSAGE_DEBUG,ctgpos)
			#pospass.add(ctgpos[0] + ":" + ctgpos[1] + "-" + str(int(ctgpos[1])+1))
			pospass.add(ctgpos[0] + ":" + ctgpos[1])
		else:
			pospass.add(row['snps'])

		genepass.add(row['gene'])
		passindex.append(index)
		if snp_topn and len(pospass)>snp_topn:
			break
		if genes_topn and len(genepass)>genes_topn:
			break
		icnt+=1

	
	#print(passindex[:10])
	#print(df_eqtl.index.values().tolist()[:10])


	#df_eqtl_pass=df_eqtl.loc[df_eqtl.index[passindex]]
	printdisplay(MESSAGE_DEBUG,'df_eqtl=' + str(df_eqtl.shape))
	df_eqtl_pass=df_eqtl.loc[passindex]
	printdisplay(MESSAGE_DEBUG,'df_eqtl_pass=' + str(df_eqtl_pass.shape))


	if annot:


		df_eqtl_pass.to_csv(infile + '.df_eqtl_pass.tsv',sep="\t",index=False)

		printdisplay(MESSAGE_DEBUG,'genepass=' + str(len(genepass)))
		printdisplay(MESSAGE_DEBUG,'pospass=' + str(len(pospass)))

		df_genetopn=get_genes_by_accession0(list(genepass),annot)
		df_locustopn=get_genes_by_position0(list(pospass),annot)

		#df_genetopn=read_url0("https://icgrc.info/api/user/gene/all?annot=all&accession=" + ",".join(list(genepass)), dflabel+"_expgenes_top" + str(topn) +'_' + suf ,requery=True)
		#df_locustopn=read_url0("https://icgrc.info/api/user/gene/" + ",".join(list(pospass)) + "?annot=cs10", dflabel+"_snpgenes_top" + str(topn) +'_' + suf  ,requery=True)
		#df_locustopn=read_url0("https://icgrc.info/api/user/gene/" + ",".join(list(pospass)) + "?annot=cs10", dflabel+"_snpgenes_top" + str(topn) +'_' + suf  ,requery=True)

		printdisplay(MESSAGE_DEBUG,'df_genetopn')
		printdisplay(MESSAGE_DEBUG,df_genetopn)
		printdisplay(MESSAGE_DEBUG,'df_locustopn')
		printdisplay(MESSAGE_DEBUG,df_locustopn)

		with open(infile+'.top_' + suf + '.geneannots.tsv','wt') as fout:
			fout.write("pvalue\tsnps\tsnpgenes\tcontig\tstart\tend\tnote\tgoterm\tiprterm\texpgenes\tnote\tgoterm\tiprterm\n")
			prevctgminmax=None

			for index, row in df_eqtl_pass.iterrows():
				#snppos=row['snps'].split("_")[:-1]
				#ctgpos="_".join(snppos).split(":")
				ctgpos=row['snps'].split(":")

				genepos=row['gene']
				df_expgene=df_genetopn[df_genetopn['name']==genepos]

				golist=[]
				gblist=[]
				iprlist=[]
				namelist=[]

				for index1, row1 in df_expgene.iterrows():
					df_expgene3=df_genetopn[(df_genetopn['contig']==row1['contig']) & (df_genetopn['fmin']==row1['fmin']) & (df_genetopn['fmax']==row1['fmax'])] 
					for index2, row2 in df_expgene3.iterrows():
						namelist.append(row2['name'])
						#if not (row2['gb_keyword'].isnan() or len(str(row2['gb_keyword']))==0):
						if not (len(str(row2['gb_keyword']))==0 or str(row2['gb_keyword'])=='nan'):
							gblist.append(str(row2['gb_keyword']))
						#if not (row2['go_term'].isnan() or len(str(row2['go_term']))==0):
						if not (len(str(row2['go_term']))==0 or str(row2['go_term'])=='nan'):
							golist.append(str(row2['go_term']))
						#if not (row2['interpro_value'].isnan() or len(str(row2['interpro_value']))==0):
						if not (len(str(row2['interpro_value']))==0 or str(row2['interpro_value'])=='nan'):
							iprlist.append(str(row2['interpro_value']))
						#	printdisplay(MESSAGE_DEBUG,"EXP GENES:  pvalue:" + str(row['pvalue']) + "\t" +  str(row2['name']) + "\t" + str(row2['fmin']) + "\t" +  str(row2['fmax']) + "\t" +  str(row2['gb_keyword']) + "\t" +   str(row2['go_term']) + "\t" +   str(row2['interpro_value']))


				# name    contig  fmin    fmax    strand  gb_keyword      go_term interpro_value
				df_snpgene=df_locustopn[ (df_locustopn['contig']==ctgpos[0]) & (df_locustopn['fmin']<=int(ctgpos[1])) & (df_locustopn['fmax']>=int(ctgpos[1]))]
				golistsnp=[]
				gblistsnp=[]
				iprlistsnp=[]
				namelistsnp=[]
				prevctgminmax=None
				for index2, row2 in df_snpgene.iterrows():
					#printdisplay(MESSAGE_DEBUG,"\t\tSNP GENE\t" + row['snps'] + "\t" + row2['name'] + "\t" + str(row2['fmin']) + "\t" +  str(row2['fmax']) + "\t" + str(row2['gb_keyword']) + "\t" +  str(row2['go_term']) + "\t" +  str(row2['interpro_value']))

					if not prevctgminmax or (prevctgminmax[0]==row2['contig']  and prevctgminmax[1]==row2['fmin']  and  prevctgminmax[2]==row2['fmax']):
						namelistsnp.append(row2['name'])
						if not (len(str(row2['gb_keyword']))==0 or str(row2['gb_keyword'])=='nan'):
							gblistsnp.append(str(row2['gb_keyword']))
						if not (len(str(row2['go_term']))==0 or str(row2['go_term'])=='nan'):
							golistsnp.append(str(row2['go_term']))
						if not (len(str(row2['interpro_value']))==0 or str(row2['interpro_value'])=='nan'):
							iprlistsnp.append(str(row2['interpro_value']))
					else:
						if prevctgminmax: 
							#fout.write(str(row['pvalue']) + "\t" + row['snps'] + "\t" + row2['name'] + "\t" + str(row2['contig']) + "\t" +  str(row2['fmin']) + "\t" +  str(row2['fmax']) + "\t" + str(row2['gb_keyword']) + "\t" +  str(row2['go_term']) + "\t" +  str(row2['interpro_value']) + "\t\t" + ",".join(namelist) + "\t"  + ",".join(gblist)  + ",".join(golist)  + ",".join(iprlist) + "\n")
							fout.write(str(row['pvalue']) + "\t" + row['snps'] + "\t" + ",".join(namelistsnp) + "\t" + str(row2['contig']) + "\t" +  str(row2['fmin']) + "\t" + str(row2['fmax'])  + "\t" + ",".join(gblistsnp) +  "\t" + ",".join(golistsnp)   +  "\t" + ",".join(iprlistsnp)  + "\t" + ",".join(namelist) + "\t"  + ",".join(gblist)  + ",".join(golist)  + ",".join(iprlist) + "\n")
							golistsnp=[]
							gblistsnp=[]
							iprlistsnp=[]
							namelistsnp=[]
					prevctgminmax=(row2['contig'],row2['fmin'],row2['fmax'])
				if prevctgminmax:
					fout.write(str(row['pvalue']) + "\t" + row['snps'] + "\t" + ",".join(namelistsnp) + "\t" + str(row2['contig']) + "\t" +  str(row2['fmin']) + "\t" + str(row2['fmax'])  + "\t" + ",".join(gblistsnp) +  "\t" + ",".join(golistsnp)   +  "\t" + ",".join(iprlistsnp)  + "\t" + ",".join(namelist) + "\t"  + ",".join(gblist)  + ",".join(golist)  + ",".join(iprlist) + "\n")
		df_genes=pd.read_csv(infile+'.top_' + suf + '.geneannots.tsv',sep="\t",index_col=None)
		df_genes=df_genes.rename(columns={"pvalue":"pvalue_y"})
		#"pvalue\tsnps\tsnpgene\tcontig\tstart\tend\tnote\tgoterm\tiprterm\texpgenes\tnote\tgoterm\tiprterm\n"
		df_eqtl_pass=pd.merge(df_eqtl_pass, df_genes, on='snps', how='left')
		try:
			df_eqtl_pass['snpgenes']=df_eqtl_pass['snpgenes'].map(select_xmfromlist,'ignore')
			df_eqtl_pass['expgenes']=df_eqtl_pass['expgenes'].map(select_xmfromlist,'ignore')
		except:
			df_eqtl_pass['snpgenes']=df_eqtl_pass['snpgenes'].applymap(select_xmfromlist,'ignore')
			df_eqtl_pass['expgenes']=df_eqtl_pass['expgenes'].applymap(select_xmfromlist,'ignore')

	return df_eqtl_pass


def verify_samples0(df, datatype1, datatype2=None, common_only=False,dtproperty=None):
	"""Validate that df has usable samples/properties for one or two datatypes.

	Parameters
	----------
	df : pandas.DataFrame
		Long-format matrix with 'datatype' and 'property' columns plus one
		column per sample.
	datatype1 : str
		Primary datatype (e.g. TYPE_EXP) that must have samples and properties.
	datatype2 : str, optional
		Secondary datatype to cross-check against datatype1.
	common_only : bool
		When True (requires datatype2), restrict the returned frames to the
		sample columns shared by both datatypes.
	dtproperty : str, optional
		A property that must exist among datatype2's properties.

	Returns
	-------
	list of pandas.DataFrame or None
		[df_clean1] for a single-datatype check;
		[df_clean1, df_clean2, df_clean12] when common_only succeeds;
		None when any validation fails.
	"""
	df_clean1=get_clean_samples(df,datatype1)
	(properties1,  samples1)=get_properties_samples(df_clean1,datatype1)
	if len(samples1)==0:
		printdisplay(MESSAGE_INFO,'No samples for ' + datatype1)
		return None
	if len(properties1)==0:
		printdisplay(MESSAGE_INFO,'No properties for ' + datatype1)
		return None

	printdisplay(MESSAGE_DEBUG,datatype1 + ' samples ' + str(len(samples1)) + '  ' + str(samples1))

	if datatype2:
		df_clean2=get_clean_samples(df,datatype2)
		(properties2,  samples2)=get_properties_samples(df_clean2,datatype2)
		if len(samples2)==0:
			printdisplay(MESSAGE_INFO,'No samples for ' + datatype2)
			return None
		if len(properties2)==0:
			printdisplay(MESSAGE_INFO,'No properties for ' + datatype2)
			return None
		if dtproperty:
			if not dtproperty in properties2:
				printdisplay(MESSAGE_INFO,'No property ' + dtproperty + '  ' + str(properties2))
				return None
			# samples with property only
			printdisplay(MESSAGE_INFO,"Check columns with value for " + dtproperty)

		# BUGFIX: this debug line previously ran even when datatype2 was None,
		# raising TypeError (None + str); it now runs only for two-datatype calls.
		printdisplay(MESSAGE_DEBUG,datatype2 + ' samples ' + str(len(samples2)) + '  ' + str(samples2))

	# common_only is meaningful only when a second datatype was verified;
	# guarding on datatype2 avoids referencing undefined samples2 below.
	if common_only and datatype2:
		comset=set(samples1).intersection(set(samples2))
		comset.discard('datatype')
		comset.discard('property')
		if len(comset)==0:
			printdisplay(MESSAGE_INFO,'No common samples for ' + datatype1 + ',' + datatype2)
			return None

		common_samples=['datatype','property'] + list(sorted(comset))
		printdisplay(MESSAGE_INFO,"common samples  " +str(len(common_samples)))
		printdisplay(MESSAGE_DEBUG,str(common_samples))
		df_clean1=df_clean1[common_samples]
		df_clean2=df_clean2[common_samples]
		df_clean12=df[common_samples]
		return [df_clean1, df_clean2,df_clean12]

	return [df_clean1]



def get_plink_common_samples(df, datatype1, plink_file):
	"""Subset df to the samples that also appear in a plink .fam file.

	Reads sample IDs (second space-separated field) from plink_file + '.fam',
	intersects them with the sample columns of df cleaned for datatype1, and
	returns df restricted to 'datatype', 'property' plus the sorted common
	sample columns.
	"""
	fam_ids=set()
	with open(plink_file + '.fam') as fam:
		for rawline in fam:
			rawline=rawline.strip()
			if not rawline:
				# mirror the original readline loop, which stopped at the
				# first blank line
				break
			fields=rawline.split(" ",3)
			fam_ids.add(fields[1].strip())

	cleaned=get_clean_samples(df,datatype1)
	shared=set(cleaned.columns.values.tolist()).intersection(fam_ids)
	printdisplay(MESSAGE_DEBUG,'common samples of ' + datatype1 + ' with ' + plink_file + ' = ' + str(len(shared)))
	keep_columns=['datatype','property'] + sorted(list(shared))
	return df[keep_columns]

def replace_NAwithblank(x):
	"""Map the literal string 'NA' to '' and strip whitespace from other values.

	Any exception (e.g. AttributeError when x is a non-string such as a
	pandas NaN float) triggers a debug print of the offending value before
	the original exception is re-raised.
	"""
	try:
		if x=='NA':
			return ''
		# BUGFIX: strip() was outside the try, so non-string inputs raised
		# AttributeError without the debug print this handler was written for.
		return x.strip()
	except Exception as ex:
		printdisplay(MESSAGE_DEBUG,x)
		raise ex


def clean_project(dflabel):
	"""Remove every working file for dflabel while preserving its last-query record.

	The <dflabel>_lastquery.tsv file is moved aside, all files matching
	dflabel* are deleted, then the record is moved back.
	"""
	query_file=dflabel + '_lastquery.tsv'
	backup_file='bak_' + query_file
	osmv(query_file + ' ' + backup_file)
	osrm(dflabel + '*')
	osmv(backup_file + ' ' + query_file)

def do_matrixeqtl0(df_snpexp, dflabel,annot,maxp=None,genes_topn=None,snp_topn=None,topn=None,df_gene=None,recodeplink=None,plink_file=None,exp_file=None,expgeneloc_file=None,recalc=False):
#def do_matrixeqtl0(df_snpexp, dflabel,annot,df_gene=None,recalc=False, maxp=None,genes_topn=None,snp_topn=None,topn=None,suf="",annot='all'):
	"""Prepare MatrixEQTL inputs, run the R scripts, and collect top eQTL genes.

	Builds <dflabel>_snp.tsv (genotype matrix), <dflabel>_exp.tsv (expression
	matrix), SNP and gene location files, then invokes matrix_eqtl.R (all
	associations) and matrix_eqtl_cistrans.R (cis/trans split), finally
	annotating top associations with do_matrixeqtl_getgenes.

	Parameters
	----------
	df_snpexp : pandas.DataFrame
		Matrix with 'datatype'/'property' columns plus sample columns,
		containing expression (and, when plink_file is None, SNP) rows.
	dflabel : str
		Prefix used for every intermediate and output file.
	annot : str
		Annotation label forwarded to the gene lookups.
	maxp : float, optional
		p-value threshold passed to the R scripts and top-gene filtering.
	genes_topn, snp_topn, topn : int, optional
		Top-N parameters forwarded to do_matrixeqtl_getgenes.
	df_gene : pandas.DataFrame, optional
		Pre-fetched gene coordinate table (name/contig/fmin/fmax).
	recodeplink : optional
		Forwarded to do_recode_plink when genotypes come from df_snpexp.
	plink_file : str, optional
		Existing plink bfile prefix; when given, genotypes are taken from it
		instead of df_snpexp.
	exp_file : str, optional
		TSV whose expression data overrides df_snpexp.
	expgeneloc_file : str, optional
		Pre-built gene location file to symlink instead of generating one.
	recalc : bool
		When True, delete previous outputs and recompute everything.

	Returns
	-------
	tuple or None
		(df_eqtl_all, df_eqtl_cis, df_eqtl_trans); None when no samples are
		shared between the expression data and the plink file.

	Side effects: writes/overwrites many <dflabel>* files and shells out to
	plink, Rscript and coreutils (cut/paste/sed/cat) via ossystem.
	"""
	if recalc:
		clean_project(dflabel)

	if recalc or not os.path.exists(dflabel+'.out.all.eqtl.txt'):

		recode='AD'
		df_clean_expsnp=None
		# Genotype source 1: an existing plink fileset. Keep only samples
		# present in both the plink .fam and the expression data.
		if plink_file: # or exp_file:

			df_expall=df_snpexp
			if exp_file:
				df_expall=pd.read_csv(exp_file,sep="\t",index_col=None)

			#df_clean_snpexp=get_plink_common_samples(df_snpexp,TYPE_EXP,plink_file)
			df_clean_snpexp=get_plink_common_samples(df_expall,TYPE_EXP,plink_file)

			df_clean_expsnp=df_expall[df_clean_snpexp.columns.values.tolist()]

			df_clean_exp=get_clean_samples(df_clean_snpexp,TYPE_EXP) # df_clean_snpexp[df_clean_snpexp['datatype']==TYPE_EXP]
			df_clean_exp=df_clean_exp.drop('datatype', axis=1)
			# only the 'property' column left means no sample column survived
			if df_clean_exp.shape[1]==1:
				printdisplay(MESSAGE_INFO,'No common samples of EXP with plinkfile ' + plink_file)
				return None # pd.DataFrame(list()):


			if recalc or not os.path.exists(dflabel + '_snp.tsv'):
				#if not os.path.exists('snp_' + dflabel + "_recodeAD.raw"):
				if os.path.exists(plink_file + '-recodeAtranspose.traw'):
					printdisplay(MESSAGE_INFO,'using ' + plink_file + '-recodeAtranspose.traw')
					#osrm("snp_" + dflabel + "_recodeAtranspose.traw")
					#cmd="ln -s " + plink_file + "-recodeAtranspose.traw  snp_" + dflabel + "_recodeAtranspose.traw"
					#printdisplay(MESSAGE_DEBUG,cmd)
					#ossystem(cmd)
				else:
					# recode to A-transpose: variants as rows, samples as columns
					cmd=plink_path + " --bfile " + plink_file  + " --recode A-transpose --out " + plink_file + "-recodeAtranspose --allow-extra-chr"
					printdisplay(MESSAGE_DEBUG,cmd)
					ossystem(cmd)

				# refresh the local symlink to the (possibly pre-existing) traw
				osrm("snp_" + dflabel + "_recodeAtranspose.traw")
				cmd="ln -s " + plink_file + "-recodeAtranspose.traw  snp_" + dflabel + "_recodeAtranspose.traw"
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)


		else:
			# Genotype source 2: SNP rows inside df_snpexp itself. Verify
			# EXP/SNP share samples, then recode genotypes through plink.
			[df_clean_exp, df_clean_snp,df_clean_expsnp]=verify_samples0(df_snpexp, TYPE_EXP, datatype2=TYPE_SNP, common_only=True)
			df_clean_exp=df_clean_exp.drop(['datatype'],axis=1)

			printdisplay(MESSAGE_DEBUG,'dim(df_clean_snp)')
			printdisplay(MESSAGE_DEBUG,df_clean_snp.shape)
			printdisplay(MESSAGE_DEBUG,str(list(df_clean_snp.columns.values)))

			printdisplay(MESSAGE_DEBUG,'dim(df_clean_exp)')
			printdisplay(MESSAGE_DEBUG,df_clean_exp.shape)
			printdisplay(MESSAGE_DEBUG,str(list(df_clean_exp.columns.values)))

			#do_recode_plink(df_clean_snp, dflabel,recode='AD',recodeplink=recodeplink,datatype=TYPE_EXP)
			do_recode_plink(df_clean_expsnp, dflabel,recode='A-transpose',recodeplink=recodeplink,datatype=TYPE_EXP)

		# transpose
		#if os.path.exists("snp_" + dflabel + "_recodeAtranspose.traw"):
		#if True: #os.path.exists("snp_" + dflabel + "_recodeAtranspose.traw"):
		# Build <dflabel>_snp.tsv: "contig:pos" row ids plus allele-dosage
		# columns restricted to samples that also have expression data.
		if recalc or  not os.path.exists(dflabel + '_snp.tsv'):

			if os.path.exists("snp_" + dflabel + "_recodeAtranspose.traw"):
				# fields 1 and 4 of a traw are CHR and POS
				cmd="cut -f1,4 snp_" + dflabel + "_recodeAtranspose.traw > snp_" + dflabel + "_recodeAtranspose.traw.postmp"
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)

				# write "contig:zero-padded-position" row ids; the leading
				# blank line pairs with the header row when pasted back later
				with open("snp_" + dflabel + "_recodeAtranspose.traw.postmp") as fin,  open("snp_" + dflabel + "_recodeAtranspose.traw.pos",'wt') as fout:
					line=fin.readline()
					line=fin.readline().strip()
					fout.write("\n")
					while line:
						cols=line.split("\t")
						fout.write(cols[0] + ":" +  cols[1].zfill(9) + "\n")
						line=fin.readline().strip()

				osrm("snp_" + dflabel + "_recodeAtranspose.traw.postmp")
				cmd="head -n1 snp_" + dflabel + "_recodeAtranspose.traw | cut -f1,2,3,4,5,6  --complement" 
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)


				hasexpsample=set(df_clean_exp.columns.values.tolist())
				# re-run the header command capturing stdout this time
				headers = subprocess.check_output(cmd, shell=True).decode('utf-8').split("\t")
				newheaders=[]
				# 1-based cut field index of the first sample column
				# (traw fields 1-6 are variant metadata)
				colcnt=7
				excludesamn=[]
				for samplehead in headers:
					# traw sample headers are FID_IID; keep the IID part.
					# NOTE(review): IIDs containing '_' would be truncated
					# here — confirm sample naming convention.
					samn=samplehead.split("_")[1].strip()
					if samn in hasexpsample:
						newheaders.append(samn)
					else:
						# remember the column number so cut can drop it
						excludesamn.append(str(colcnt))
					colcnt+=1


				print('plink samples with exp data ' + str(len(newheaders)) + "/" + str(len(headers)))
				excludesamnstr=""
				if len(excludesamn)>0:
					excludesamnstr="," + ",".join(excludesamn)
				# drop metadata + no-expression columns, strip header, blank out NA
				cmd="cut -f1,2,3,4,5,6" + excludesamnstr + " --complement snp_" + dflabel + "_recodeAtranspose.traw | tail -n +2 | sed 's/NA//g' > snp_" + dflabel + "_recodeAtranspose.traw.allele"
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)


				# NOTE(review): the tab characters in this echo command are
				# word-split by the shell, so the written header is
				# space-separated rather than tab-separated — presumably
				# tolerated downstream; verify against matrix_eqtl.R's reader.
				cmd="echo " + "\t".join(newheaders) + " > snp_" + dflabel + "_recodeAtranspose.traw.newheader"
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)
				
				cmd="cat  snp_" + dflabel + "_recodeAtranspose.traw.newheader snp_" + dflabel + "_recodeAtranspose.traw.allele > " + dflabel + '_snp.tsvtmp'
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)

				# prepend the "contig:pos" id column built earlier
				cmd="paste snp_" + dflabel + "_recodeAtranspose.traw.pos " + dflabel + "_snp.tsvtmp > " + dflabel + '_snp.tsv' 
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)

				osrm("snp_" + dflabel + "_recodeAtranspose.traw.newheader")
				osrm( dflabel + '_snp.tsvtmp')
				osrm( "snp_" + dflabel + "_recodeAtranspose.traw.allele")
				osrm("snp_" + dflabel + "_recodeAtranspose.traw.postmp")
				osrm("snp_" + dflabel + "_recodeAtranspose.traw.pos")
				


			# fallback: read an AD-recoded .raw genotype file and transpose
			# it in pandas instead of via shell tools
			elif True: #os.path.exists('snp_' + dflabel + "_recodeAD.raw"):
				printdisplay(MESSAGE_DEBUG,'reading ' + 'snp_' + dflabel + "_recodeAD.raw")
				dfsnp=pd.read_csv( 'snp_' + dflabel + "_recodeAD.raw",sep=" ")
				dfsnp = dfsnp.drop(['FID','PAT','MAT','SEX', 'PHENOTYPE'],axis=1)
				dfsnp = dfsnp.rename(columns={"IID": "id"})
				dfsnp.set_index('id',inplace=True)
				#printdisplay(MESSAGE_DEBUG,dfsnp)
				printdisplay(MESSAGE_DEBUG,'transposing ' + 'snp_' + dflabel + "_recodeAD.raw")
				dfsnp=dfsnp.transpose()
				# nullable Int64 keeps missing genotypes as <NA> instead of floats
				for col in dfsnp.columns.values.tolist():
					dfsnp[col] = dfsnp[col].astype('Int64')


				printdisplay(MESSAGE_DEBUG,'dim(dfsnp)')
				printdisplay(MESSAGE_DEBUG,dfsnp.shape)
				printdisplay(MESSAGE_DEBUG,list(dfsnp.columns.values))

				printdisplay(MESSAGE_DEBUG,'dim(df_clean_exp)')
				printdisplay(MESSAGE_DEBUG,df_clean_exp.shape)
				printdisplay(MESSAGE_DEBUG,list(df_clean_exp.columns.values))
				dfsnp.to_csv(dflabel + '_snp.tsv',sep="\t")



		printdisplay(MESSAGE_DEBUG,'saving ' + dflabel + '_exp.tsv')
		#df_clean_exp.iloc[:,1:].to_csv(dflabel + '_exp.tsv',sep="\t")
		df_clean_exp.to_csv(dflabel + '_exp.tsv',sep="\t",index=False)


		print(sorted(df_clean_expsnp.columns.values.tolist()))
		print(sorted(df_clean_expsnp.index.values.tolist()))
		printdisplay(MESSAGE_INFO,df_clean_expsnp, 'df_clean_expsnp')
		# save per-sample annotation rows (BioProject / expression dataset)
		# for the expression sample columns only
		dfannot=df_clean_expsnp[df_clean_expsnp['property'].isin(["NCBI BioProject","expression_dataset"])]
		dfannot=dfannot[df_clean_exp.columns.values.tolist()]
		dfannot=dfannot.set_index('property')
		dfannot.to_csv(dflabel + '_exp.annot.tsv',sep="\t",index=True)

		#to_csv0(df_clean_expsnp,dflabel+'_exp.annot.tsv',annot=["NCBI BioProject"])

		#ossystem("cut -f1 " + dflabel + '_snp.tsv' + '.tsv | tail -n +2 > ' + dflabel + '_snp.snpsloc'
		# derive the SNP location file from the "contig:pos..." row ids of
		# the genotype matrix written above
		with open(dflabel + '_snp.tsv') as fin, open(dflabel + '_snp.snpsloc.tsv','wt') as fout:
			line=fin.readline() 
			line=fin.readline().strip() 
			fout.write("snp\tchr\tpos\n")
			while line:
				#NC_044374.1:052618376_A 
				cols=line.split("\t",2)
				ctg=cols[0].split(":")[0]
				pos=cols[0].split(":")[1].split("_")[0]
				fout.write(cols[0] + "\t" + ctg + "\t" + str(int(pos)) + "\n")
				line=fin.readline().strip() 


		# gene accessions come from the expression 'property' rows
		setGenes=set(list(df_clean_exp['property']))

	#df_genes
	#name	contig	fmin	fmax	strand	gb_keyword	go_term	interpro_value
	#LOC115717841	NC_044374.1	52617968	52619389	1	"cannabidiolic acid synthase-like"	"GO:0071949,FAD binding, GO:0050660,flavin adenine dinucleotide binding, GO:0016491,oxidoreductase activity, GO:0055114,oxidation-reduction process"	"IPR036318,
	#geneid	chr	s1	s2
	#Gene_01	chr1	721289	731289
	#Gene_02	chr1	752565	762565

	#	printdisplay(MESSAGE_DEBUG,'setGenes')
	#	printdisplay(MESSAGE_DEBUG,list(setGenes))
		# build the gene coordinate file required by the cis/trans split
		do_cistrans=False
		if True: # and len(setGenes)<200:
			if expgeneloc_file:
				osrm( dflabel + '_exp.genesloc.tsv')
				cmd="ln -s " + expgeneloc_file  + " "  + dflabel + '_exp.genesloc.tsv'
				printdisplay(MESSAGE_INFO,cmd)
				ossystem(cmd)
			# NOTE(review): truth-testing a DataFrame raises ValueError, and
			# df_gene['name','contig',...] needs double brackets — this branch
			# looks unexercised; confirm before relying on it.
			elif df_gene:
				df_geneloc=df_gene['name','contig','fmin','fmax'].rename(columns={'name':'geneid','contig':'chr','fmin':'s1','fmax':'s2'})
				df_geneloc.to_csv(dflabel + '_exp.genesloc.tsv',sep="\t",index=False)
			else:
				if not df_gene:
					#df_gene=read_url0("https://icgrc.info/api/user/gene/all?annot=all&accession=" + ",".join(list(setGenes)), dflabel+"_genes" ,requery=recalc)
					df_gene=get_genes_by_accession0(list(setGenes),annot)
				#eqtl_21trichs_21trichcs10_exp.genesloc.tsv
				with open(dflabel + '_exp.genesloc.tsv','wt') as fout:
					df_gene=df_gene[['name','contig','fmin','fmax']]
					fout.write("geneid\tchr\ts1\ts2\n")
					for index, row in df_gene.iterrows():
						if row['name'] in setGenes:
							fout.write(row['name']+"\t"+row['contig']+"\t"+str(row['fmin'])+"\t"+str(row['fmax'])+"\n")
			do_cistrans=True
		else:
			printdisplay(MESSAGE_DEBUG,"nGenes = " + str(len(setGenes)) + " >200,  cis/trans is not separated\nRequires genome coordinates of all genes/mRNAs, rerun and provide the coordinate file")

		# NOTE(review): uses the literal "Rscript" rather than the
		# module-level r_path setting used for plink_path — confirm intent.
		cmd="Rscript matrix_eqtl.R " + dflabel + " " + str(maxp)
			
		printdisplay(MESSAGE_DEBUG,cmd)
		ossystem("rm Rplots*.pdf")
		ossystem(cmd)
		ossystem("mv Rplots.pdf " + dflabel + '.all.eqtlpvalue.pdf')
		#printdisplay(MESSAGE_DEBUG,'check ' + dflabel + '.eqtlpvalue.pdf')
		pdf2png(dflabel + '.all.eqtlpvalue.pdf')
		printdisplay(MESSAGE_DEBUG,"generated " + dflabel+'.out.all.eqtl.txt')
		#oscp(dflabel+'.out.all.eqtl.txt ' + dflabel+'.out.all.eqtl.txt.bak')


	pltshow(MESSAGE_INFO, dflabel + '.all.eqtlpvalue.png')

	# extract (or reload) the annotated top associations over all pairs
	if recalc or  not os.path.exists(dflabel + '.eqtl.all.top.tsv'):
		printdisplay(MESSAGE_DEBUG,"\t\tALL\t\t")
		df_eqtl_all=do_matrixeqtl_getgenes(dflabel+'.out.all.eqtl.txt',dflabel,maxp=maxp,genes_topn=genes_topn,snp_topn=snp_topn,topn=topn,suf="all",annot=annot)
		#printdisplay(MESSAGE_DEBUG,'generated ' + dflabel+'.out.all.eqtl.txt'+'.top_all.geneannots.tsv')
		df_eqtl_all.to_csv(dflabel + '.eqtl.all.top.tsv',sep="\t",index=False)
		#printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.eqtl.all.top.tsv')
	else:
		df_eqtl_all=pd.read_csv(dflabel + '.eqtl.all.top.tsv',sep="\t")

	# forced on regardless of the flag computed in the genesloc section above
	do_cistrans=True
	df_eqtl_cis=None
	df_eqtl_trans=None

	if do_cistrans:

		if recalc or not os.path.exists(dflabel + '.eqtl.trans.top.tsv'):

			# run the cis/trans R pass only when its outputs are missing
			if recalc or  not (os.path.exists(dflabel+'.out.cis.eqtl.txt') and os.path.exists(dflabel+'.out.trans.eqtl.txt')):
				ossystem("rm Rplots*.pdf")
				cmd="Rscript matrix_eqtl_cistrans.R " + dflabel  + " " + str(maxp)
				printdisplay(MESSAGE_DEBUG,cmd)
				ossystem(cmd)
				ossystem("mv Rplots.pdf " + dflabel + '.cistrans.eqtlpvalue.pdf')
				pdf2png(dflabel + '.cistrans.eqtlpvalue.pdf')

			printdisplay(MESSAGE_DEBUG,"\t\tCIS\t\t")
			df_eqtl_cis=do_matrixeqtl_getgenes(dflabel+'.out.cis.eqtl.txt',dflabel,maxp=maxp,genes_topn=genes_topn,snp_topn=snp_topn,topn=topn,suf="cis",annot=annot)

			printdisplay(MESSAGE_DEBUG,"\t\tTRANS\t\t")
			df_eqtl_trans=do_matrixeqtl_getgenes(dflabel+'.out.trans.eqtl.txt',dflabel,maxp=maxp,genes_topn=genes_topn,snp_topn=snp_topn,topn=topn,suf="trans",annot=annot)
			#printdisplay(MESSAGE_DEBUG,'generated ' + dflabel+'.out.cis.eqtl.txt'+'.top_cis.geneannots.tsv')
			#printdisplay(MESSAGE_DEBUG,'generated ' + dflabel+'.out.trans.eqtl.txt'+'.top_trans.geneannots.tsv')

			df_eqtl_cis.to_csv(dflabel + '.eqtl.cis.top.tsv',sep="\t",index=False)
			df_eqtl_trans.to_csv(dflabel + '.eqtl.trans.top.tsv',sep="\t",index=False)
			#printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.eqtl.cis.top.tsv')
			#printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.eqtl.trans.top.tsv')
		else:
			df_eqtl_cis=pd.read_csv(dflabel + '.eqtl.cis.top.tsv',sep="\t")
			df_eqtl_trans=pd.read_csv(dflabel + '.eqtl.trans.top.tsv',sep="\t")



		pltshow(MESSAGE_INFO,dflabel + '.cistrans.eqtlpvalue.png')


	# report whichever result file (annotated or plain) is available
	if os.path.exists( dflabel+'.out.cis.eqtl.txt.top_cis.geneannots.tsv'):
		printdisplay(MESSAGE_INFO,dflabel + '.out.cis.eqtl.txt.top_cis.geneannots.tsv' ,'top snp-expressed cis gene annotated')
	else:
		printdisplay(MESSAGE_INFO,dflabel + '.eqtl.cis.top.tsv','top snp-expressed cis gene')

	if os.path.exists( dflabel+'.out.trans.eqtl.txt.top_trans.geneannots.tsv'):
		printdisplay(MESSAGE_INFO,dflabel + '.eqtl.trans.top.tsv','top snp-expressed trans gene annotated')
	else:
		printdisplay(MESSAGE_INFO,dflabel + '.eqtl.trans.top.tsv','top snp-expressed trans gene')

	if os.path.exists( dflabel+'.out.all.eqtl.txt.top_all.geneannots.tsv'):
		printdisplay(MESSAGE_INFO,dflabel + '.eqtl.all.top.tsv','top snp-expressed all gene annotated')
	else:
		printdisplay(MESSAGE_INFO,dflabel + '.eqtl.all.top.tsv','top snp-expressed all gene ' + dflabel + '.eqtl.all.top.tsv')


	return (df_eqtl_all,df_eqtl_cis,df_eqtl_trans)


def do_matrixeqtl_top(df_snpexp, dflabel,annot,maxp=None,genes_topn=None,snp_topn=None,topn=None,df_gene=None,recodeplink=None,plink_file=None,exp_file=None,expgeneloc_file=None):
	"""Run (or reuse) the MatrixEQTL pipeline and extract the top eQTL genes.

	Calls do_matrixeqtl0 when <dflabel>.out.all.eqtl.txt is missing, then
	annotates the top all/cis/trans associations via do_matrixeqtl_getgenes
	and writes the <dflabel>.eqtl.{all,cis,trans}.top.tsv files.

	Returns
	-------
	tuple
		(df_eqtl_all, df_eqtl_cis, df_eqtl_trans); the cis/trans frames are
		None when the cis/trans pass is skipped.
	"""
	if not os.path.exists(dflabel+'.out.all.eqtl.txt'):
		do_matrixeqtl0(df_snpexp, dflabel,annot,maxp=maxp,genes_topn=genes_topn,snp_topn=snp_topn,topn=topn,df_gene=df_gene,recodeplink=recodeplink,plink_file=plink_file,exp_file=exp_file,expgeneloc_file=expgeneloc_file)

	printdisplay(MESSAGE_DEBUG,"\t\tALL\t\t")
	df_eqtl_all=do_matrixeqtl_getgenes(dflabel+'.out.all.eqtl.txt',dflabel,maxp=maxp,genes_topn=genes_topn,snp_topn=snp_topn,topn=topn,suf="all",annot=annot)

	printdisplay(MESSAGE_DEBUG,'generated ' + dflabel+'.out.all.eqtl.txt'+'.top_all.geneannots.tsv')
	df_eqtl_cis=None
	df_eqtl_trans=None

	# BUGFIX: do_cistrans was referenced here without ever being assigned in
	# this function (NameError); default to True as do_matrixeqtl0 does.
	do_cistrans=True
	if do_cistrans:
		# pass maxp for consistency with do_matrixeqtl0's cistrans invocation
		cmd="Rscript matrix_eqtl_cistrans.R " + dflabel + " " + str(maxp)
		printdisplay(MESSAGE_DEBUG,cmd)
		ossystem(cmd)
		pdf2png(dflabel + '.eqtlqqplot.pdf')

		printdisplay(MESSAGE_DEBUG,"\t\tCIS\t\t")
		df_eqtl_cis=do_matrixeqtl_getgenes(dflabel+'.out.cis.eqtl.txt',dflabel,maxp=maxp,genes_topn=genes_topn,snp_topn=snp_topn,topn=topn,suf="cis",annot=annot)

		printdisplay(MESSAGE_DEBUG,"\t\tTRANS\t\t")
		# BUGFIX: the trans pass previously re-read the *cis* output file
		df_eqtl_trans=do_matrixeqtl_getgenes(dflabel+'.out.trans.eqtl.txt',dflabel,maxp=maxp,genes_topn=genes_topn,snp_topn=snp_topn,topn=topn,suf="trans",annot=annot)
		printdisplay(MESSAGE_DEBUG,'generated ' + dflabel+'.out.cis.eqtl.txt'+'.top_cis.geneannots.tsv')
		printdisplay(MESSAGE_DEBUG,'generated ' + dflabel+'.out.trans.eqtl.txt'+'.top_trans.geneannots.tsv')

		df_eqtl_cis.to_csv(dflabel + '.eqtl.cis.top.tsv',sep="\t",index=False)
		df_eqtl_trans.to_csv(dflabel + '.eqtl.trans.top.tsv',sep="\t",index=False)
		printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.eqtl.cis.top.tsv')
		printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.eqtl.trans.top.tsv')

	df_eqtl_all.to_csv(dflabel + '.eqtl.all.top.tsv',sep="\t",index=False)
	printdisplay(MESSAGE_INFO,'generated ' + dflabel + '.eqtl.all.top.tsv')


	return (df_eqtl_all,df_eqtl_cis,df_eqtl_trans)



def select_xmfromlist(x):
	"""Return the first comma-separated entry of x that starts with 'XM'.

	Falls back to the whitespace-stripped input when no entry matches.
	Non-string inputs are logged at debug level and the original exception
	is re-raised.
	"""
	try:
		candidates=(token.strip() for token in x.split(","))
		for candidate in candidates:
			if candidate.startswith('XM'):
				return candidate
	except Exception as ex:
		printdisplay(MESSAGE_DEBUG,x)
		raise ex
	return x.strip()



#df_eqtl_wgcna_onsnpgene=None
#df_gwas=None


def merge_matrixeqtl_wgcna_gwas0(eqtls=None, wgcnas=None,gwass=None,outfile='merged_snpgene_trait.tsv',use_snpgene=True,df_genes=None,recalc=False,requery=False,maxp_wgcna=None,maxp_eqtl=None,maxp_gwas=None):
	#global dummytable,df_gwas
	df_eqtl_wgcna_onsnpgene=None
	df_gwass=None
	df_eqtls=None
	df_wgcnas=None

	if recalc:
		osrm('*' + outfile)

	if eqtls:
		for eqtl_label in eqtls:
			df_eqtl=None
			if isinstance(eqtl_label,str): 
				#df_eqtl=pd.read_csv(eqtl_label, sep="\t", usecols=['snps','snpgenes','expgenes','pvalue'])
				df_eqtl=pd.read_csv(eqtl_label, sep="\t") 
			elif isinstance(eqtl_label, pd.DataFrame):
				df_eqtl=eqtl_label.copy()

			cols=df_eqtl.columns.values.tolist()
			includecols=[]
			includecols.append("snps")
			rename=dict()
			if 'snpgenes' in cols:
				includecols.append("snpgenes")
			if 'expgenes' in cols:
				includecols.append("expgenes")
			if 'gene' in cols:
				includecols.append("gene")
				if not 'expgenes' in cols:
					rename['gene']='expgenes'
			includecols.append("pvalue")
			df_eqtl=df_eqtl[includecols]

			if maxp_eqtl:
				df_eqtl=df_eqtl[df_eqtl['pvalue']<=maxp_eqtl]

			if len(rename)>0:
				df_eqtl=df_eqtl.rename(columns=rename)


			if use_snpgene and (not df_genes is None) and (not 'expgenes' in df_eqtl.columns.values.tolist()) or  (not 'snpgenes' in df_eqtl.columns.values.tolist()):
				df_eqtl=do_snps_getgenes(df_eqtl,df_genes,'eqtl_' + outfile,snpcolumn="snps",other_columns=['expgenes','pvalue'],requery=recalc) 

			cols=df_eqtl.columns.values.tolist()
			#if 'snpgenes' in cols:
			try:
				df_eqtl['snpgenes']=df_eqtl['snpgenes'].map(select_xmfromlist,'ignore')
				df_eqtl['expgenes']=df_eqtl['expgenes'].map(select_xmfromlist,'ignore')	#df_eqtl.apply(lambda x: select_fromlist(x['expgenes'], keep='XM'), axis=1)
			except:
				df_eqtl['snpgenes']=df_eqtl['snpgenes'].applymap(select_xmfromlist,'ignore')
				df_eqtl['expgenes']=df_eqtl['expgenes'].applymap(select_xmfromlist,'ignore')	#df_eqtl.apply(lambda x: select_fromlist(x['expgenes'], keep='XM'), axis=1)

			if not df_eqtl is None:
				df_eqtls=pd.concat([df_eqtls, df_eqtl])
			else:
				df_eqtls=df_eqtl


		printdisplay(MESSAGE_DEBUG,'df_eqtls ' + str(df_eqtls.shape))
		printdisplay(MESSAGE_DEBUG,list(df_eqtls.columns.values))


	if wgcnas:
		#df_wgcna=pd.read_csv(wgcna_label + '.relateModsToExt.top.genetrait.annot.tsv' , sep="\t", usecols=['substanceBXH','trait','GS','p.GS'])
		for wgcna_label in wgcnas:
			df_wgcna=None
			if isinstance(wgcna_label,str): 
				df_wgcna=pd.read_csv(wgcna_label, sep="\t", usecols=['trait','expgenes','GS','p.GS'])
			elif isinstance(wgcna_label, pd.DataFrame):
				df_wgcna=wgcna_label.copy()
			if not df_wgcna is None:
				df_wgcnas=pd.concat([df_wgcnas, df_wgcna])
			else:
				df_wgcnas=df_wgcna
		
		if maxp_wgcna:
			df_wgcnas=df_wgcnas[df_wgcnas['p.GS']<=maxp_wgcna]

		printdisplay(MESSAGE_DEBUG,'df_wgcnas ' + str(df_wgcnas.shape))
		printdisplay(MESSAGE_DEBUG,list(df_wgcnas.columns.values))



	if os.path.exists('gwass_renamedtrait_' + outfile) and not recalc:
		printdisplay(MESSAGE_DEBUG,'loading ' + 'gwass_renamedtrait_' + outfile)
		df_gwass=pd.read_csv('gwass_renamedtrait_' + outfile,sep="\t")
		if maxp_gwas:
			df_gwass=df_gwass[df_gwass['pvalue']<=maxp_gwas]
		printdisplay(MESSAGE_DEBUG,'loaded ' + 'gwass_renamedtrait_' + outfile)
		printdisplay(MESSAGE_DEBUG,'df_gwass ' + str(df_gwass.shape))
		printdisplay(MESSAGE_DEBUG,list(df_gwass.columns.values))

	elif gwass:
		for gwas_label in gwass:
			df_gwas=None
			if isinstance(gwas_label,str): 
				#df_gwas=pd.read_csv(gwas_label, sep="\t", usecols=['snps','snpgenes','trait','pvalue'])
				df_gwas=pd.read_csv(gwas_label, sep="\t") #, usecols=['snps','snpgenes','trait','pvalue'])
			elif isinstance(gwas_label, pd.DataFrame):
				df_gwas=gwas_label.copy()

			#trait   CHR     snps    BP      NMISS   BETA    SE      R2      T       pvalue
			#linalool        NC_044370.1     NC_044370.1000102305    102305  18      0.285   2.083e-09       1.0     136900000.0     5.571000000000001e-122

			cols=df_gwas.columns.values.tolist()
			includecols=[]
			includecols.append("snps")
			rename=dict()
			if 'snpgenes' in cols:
				includecols.append("snpgenes")
			includecols.append("trait")
			includecols.append("pvalue")
			df_gwas=df_gwas[includecols]

			if len(rename)>0:
				df_gwas=df_gwas.rename(columns=rename)

			if maxp_gwas:
				df_gwas=df_gwas[df_gwas['pvalue']<=maxp_gwas]

			#if use_snpgene and not df_genes is None:
			if use_snpgene and (not df_genes is None) and (not 'snpgenes' in df_gwas.columns.values.tolist()):

				df_gwas=do_snps_getgenes(df_gwas,df_genes,'gwas_' + outfile,snpcolumn="snps",other_columns=['trait','pvalue'],requery=recalc) 

			cols=df_gwas.columns.values.tolist()
			#if 'snpgenes' in cols:
			try:
				df_gwas['snpgenes']=df_gwas['snpgenes'].map(select_xmfromlist,'ignore')
			except:
				df_gwas['snpgenes']=df_gwas['snpgenes'].applymap(select_xmfromlist,'ignore')

			if not df_gwass is None:
				df_gwass=pd.concat([df_gwass, df_gwas])
			else:
				df_gwass=df_gwas

		# df_gwass group by snpgene



		printdisplay(MESSAGE_DEBUG,'df_gwass ' + str(df_gwass.shape))
		printdisplay(MESSAGE_DEBUG,list(df_gwass.columns.values))


	# needed by triple merge
	df_eqtl_wgcna_onexpgenes=None
	if recalc or (not df_wgcnas is None and not df_eqtls is None and not os.path.exists('eqtl_wgcna_onexpgene_' + outfile)):

		printdisplay(MESSAGE_DEBUG,'df_eqtls= ' + str(df_eqtls.columns.values.tolist()))
		printdisplay(MESSAGE_DEBUG,'df_wgcnas= ' +str(df_wgcnas.columns.values.tolist()))

		df_eqtl_wgcna_onexpgenes=pd.merge(df_eqtls,df_wgcnas,on="expgenes")
		printdisplay(MESSAGE_DEBUG,'df_eqtl_wgcna expgenes')
		printdisplay(MESSAGE_DEBUG,list(df_eqtl_wgcna_onexpgenes.columns.values.tolist()))
		printdisplay(MESSAGE_DEBUG,df_eqtl_wgcna_onexpgenes)
		'''
		df_wgcna_copy=df_wgcna.rename(columns={"substanceBXH":"snpgene"})
		df_eqtl_wgcna_onsnpgene=pd.merge(df_eqtl,df_wgcna_copy,on="snpgene")
		printdisplay(MESSAGE_DEBUG,'df_eqtl_wgcna snpgene')
		printdisplay(MESSAGE_DEBUG,list(df_eqtl_wgcna_onsnpgene.columns.values))
		printdisplay(MESSAGE_DEBUG,df_eqtl_wgcna_onsnpgene)
		'''
		df_eqtl_wgcna_onexpgenes.to_csv('eqtl_wgcna_onexpgene_' + outfile,sep="\t",index=False)
		printdisplay(MESSAGE_DEBUG,'df_eqtl_wgcna_onexpgenes ' + str(df_eqtl_wgcna_onexpgenes.columns.values.tolist()))
		printdisplay(MESSAGE_INFO,'generated ' + 'eqtl_wgcna_onexpgene_' + outfile)
		printdisplay(MESSAGE_INFO,df_eqtl_wgcna_onexpgenes, 'eqtl_wgcna_onexpgene_' + outfile)
	else:
		df_eqtl_wgcna_onexpgenes=pd.read_csv('eqtl_wgcna_onexpgene_' + outfile,sep="\t")
		printdisplay(MESSAGE_INFO,'loaded ' + 'df_eqtl_wgcna_onexpgenes_' + outfile)
		printdisplay(MESSAGE_INFO,df_eqtl_wgcna_onexpgenes, 'eqtl_wgcna_onexpgene_' + outfile)



	if not df_eqtls is None and not df_gwass is None:
		try:
			df_eqtls["snpgenes"]=df_eqtls["snpgenes"].map(select_xmfromlist,'ignore')
			df_gwass["snpgenes"]=df_gwass["snpgenes"].map(select_xmfromlist,'ignore')
		except:
			df_eqtls["snpgenes"]=df_eqtls["snpgenes"].applymap(select_xmfromlist,'ignore')
			df_gwass["snpgenes"]=df_gwass["snpgenes"].applymap(select_xmfromlist,'ignore')

		# get common snpgenes

		if use_snpgene:
			if not os.path.exists('eqtl_gwas_onsnpgene_' + outfile) or recalc:
				eqtls_snpgenes=set(df_eqtls["snpgenes"].tolist())
				gwass_snpgenes=set(df_gwass["snpgenes"].tolist())
				commonsnphenes=eqtls_snpgenes.intersection(gwass_snpgenes)
				df_eqtls_snpgenes=df_eqtls[df_eqtls["snpgenes"].isin(commonsnphenes)]
				df_gwass_snpgenes=df_gwass[df_gwass["snpgenes"].isin(commonsnphenes)]
				df_eqtls_snpgenes.loc[df_eqtls_snpgenes['snpgenes']=='','snpgenes']= np.nan # df_eqtls_snpgenes['snpgenes'].replace('', np.nan)

				df_eqtls_snpgenes=df_eqtls_snpgenes.dropna(subset=['snpgenes'])
				df_gwass_snpgenes.loc[df_gwass_snpgenes['snpgenes']=='','snpgenes']= np.nan #df_gwass_snpgenes['snpgenes'].replace('', np.nan)
				df_gwass_snpgenes=df_gwass_snpgenes.dropna(subset=['snpgenes'])

				df_eqtl_gwas_onsnpgene=pd.merge(df_eqtls_snpgenes,df_gwass_snpgenes,on="snpgenes")

				printdisplay(MESSAGE_INFO,'skipping eqtl_gwas_onsnpgene_' +  outfile + '  shape=' + str(df_eqtl_gwas_onsnpgene.shape))

		else:
			if not os.path.exists('eqtl_gwas_onsnp_' + outfile) or recalc:
				eqtls_snpgenes=set(df_eqtls["snps"].tolist())
				gwass_snpgenes=set(df_gwass["snps"].tolist())
				commonsnphenes=eqtls_snpgenes.intersection(gwass_snpgenes)
				df_eqtls_snpgenes=df_eqtls[df_eqtls["snps"].isin(commonsnphenes)]
				df_gwass_snpgenes=df_gwass[df_gwass["snps"].isin(commonsnphenes)]

				df_eqtl_gwas_onsnp=pd.merge(df_eqtls,df_gwass,on="snps")
				printdisplay(MESSAGE_DEBUG,'df_eqtl_gwas snps')
				printdisplay(MESSAGE_DEBUG,list(df_eqtl_gwas_onsnp.columns.values))
				printdisplay(MESSAGE_DEBUG,df_eqtl_gwas_onsnp)
				df_eqtl_gwas_onsnp.to_csv('eqtl_gwas_onsnp_' + outfile,sep="\t",index=False)
			
			
	if recalc or ( not os.path.exists('wgcna_gwas_ontrait_' + outfile) and not df_wgcnas is None and not df_gwass is None):

		df_gwas_cp=df_gwass.rename(columns={"phen":"trait"})
		# rename gwas trait
		for itrait in set(df_gwas_cp['trait'].tolist()).difference(['Percent_Total_Cannabinoids_per_dry_weight', 'THCA_to_CBDA_ratio', 'Total_cannabinoids', 'Total_monoterpenes', 'Total_sesquiterpenes', 'Total_terpenoids']):
			#df_gwas_cp[df_gwas_cp[df_gwas_cp['trait']==itrait], 'trait']=itrait.replace(",",".").replace("-",".").replace(" ",".")

			df_gwas_cp.loc[df_gwas_cp['trait']==itrait, 'trait']=itrait.replace(",",".").replace("-",".").replace(" ",".").replace("_",".").replace("\xa0","")
			#df.loc[df['First Season'] > 1990, 'First Season'] = 1
		df_gwass=df_gwas_cp

		printdisplay(MESSAGE_DEBUG,'df_gwas_cp= ' + str(df_gwas_cp.columns.values.tolist()))
		printdisplay(MESSAGE_DEBUG,'df_wgcnas= ' +str(df_wgcnas.columns.values.tolist()))

		
		df_wgcna_gwas_ontrait=pd.merge(df_wgcnas,df_gwas_cp,on="trait")

		printdisplay(MESSAGE_INFO,'skipping ' + 'wgcna_gwas_ontrait_' + outfile + '  shape=' + str(df_wgcna_gwas_ontrait.shape))
		
		df_gwass.to_csv('gwass_renamedtrait_' + outfile,sep="\t",index=False)
		printdisplay(MESSAGE_DEBUG,'generated ' + 'gwass_renamedtrait_' + outfile)
		printdisplay(MESSAGE_DEBUG,'gwass_renamedtrait_' + outfile)



	if not os.path.exists(outfile) or recalc:

		if True and  (not df_eqtl_wgcna_onexpgenes is None) and not (df_gwass is None):

			printdisplay(MESSAGE_DEBUG,'calculating ' + outfile )
			if False: # already renamed in (df_wgcnas is None and not df_gwass)
				df_gwas_cp=df_gwass.rename(columns={"phen":"trait"})
				# rename gwas trait
				for itrait in set(df_gwas_cp['trait'].tolist()).difference(['Percent_Total_Cannabinoids_per_dry_weight', 'THCA_to_CBDA_ratio', 'Total_cannabinoids', 'Total_monoterpenes', 'Total_sesquiterpenes', 'Total_terpenoids']):
					df_gwas_cp.loc[df_gwas_cp['trait']==itrait, 'trait']=itrait.replace(",",".").replace("-",".").replace(" ",".").replace("_",".").replace("\xa0","")
				df_gwass=df_gwas_cp


			if use_snpgene:



				printdisplay(MESSAGE_DEBUG,'df_gwass=' + str(df_gwass.shape))
				printdisplay(MESSAGE_DEBUG,'df_eqtl_wgcna_onexpgenes=' +str(df_eqtl_wgcna_onexpgenes.shape) + '  ' + str(df_eqtl_wgcna_onexpgenes.columns.values.tolist()))
				#df_eqtl_wgcna_onexpgenes=df_eqtl_wgcna_onexpgenes[['trait','snpgenes','pvalue','expgenes','GS', 'p.GS']].drop_duplicates()
				df_eqtl_wgcna_onexpgenes=df_eqtl_wgcna_onexpgenes[['trait','snpgenes','pvalue','expgenes','GS', 'p.GS']] #.groupby(['trait','snpgenes']).agg({})
				
				df_eqtl_wgcna_onexpgenes=df_eqtl_wgcna_onexpgenes.rename(columns={'pvalue':'pvalue_eqtl','GS':'GS_wgcna', 'p.GS':'pGS_wgcna'}).groupby(['trait','snpgenes'], as_index=False).agg({'pvalue_eqtl':['count','min'],'GS_wgcna':['max'],'pGS_wgcna':['count','min'] })


				printdisplay(MESSAGE_DEBUG,'df_gwass=' +str(df_gwass.shape) + '  ' + str(df_gwass.columns.values.tolist()))
				df_gwass=df_gwass[['trait','snpgenes','pvalue']] #.drop_duplicates()
				df_gwass=df_gwass.rename(columns={'pvalue':'pvalue_gwas'}).groupby(['trait','snpgenes'], as_index=False).agg({'pvalue_gwas':['count','min']})

				printdisplay(MESSAGE_DEBUG,'unique trait-snpgene')
				printdisplay(MESSAGE_DEBUG,'df_gwass=' +str(df_gwass.shape))
				printdisplay(MESSAGE_DEBUG,'df_eqtl_wgcna_onexpgenes=' +str(df_eqtl_wgcna_onexpgenes.shape))

				df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=pd.merge(df_eqtl_wgcna_onexpgenes,df_gwass, on=['trait','snpgenes']) #.drop_duplicates()
				printdisplay(MESSAGE_DEBUG,'df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=' +str(df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait.shape))

				df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait.sort_values(['snpgenes','trait'])
				printdisplay(MESSAGE_INFO,'df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait ' + str(df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait.shape) + "\n" + str(df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait.columns.values.tolist()))
				df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait.to_csv(outfile,sep="\t",index=False)
				printdisplay(MESSAGE_INFO,'using unfiltered ' + outfile)
				#printdisplay(MESSAGE_INFO,'generated ' + outfile)


			else:
				df_eqtl_wgcna_onexpgenes_gwas_onsnptrait=pd.merge(df_eqtl_wgcna_onexpgenes,df_gwass, on=['trait','snps'])
				printdisplay(MESSAGE_INFO,'df_eqtl_wgcna_onexpgenes_gwas_onsnptrait trait')
				printdisplay(MESSAGE_INFO,list(df_eqtl_wgcna_onexpgenes_gwas_onsnptrait.columns.values.tolist()))
				printdisplay(MESSAGE_INFO,str(df_eqtl_wgcna_onexpgenes_gwas_onsnptrait.shape))
				df_eqtl_wgcna_onexpgenes_gwas_onsnptrait.to_csv(outfile,sep="\t",index=False)


			printdisplay(MESSAGE_INFO,'generated ' + outfile)
		else:
			printdisplay(MESSAGE_DEBUG, 'NOT MET (not df_eqtl_wgcna_onexpgenes is None) and not (df_gwass is None)')
			printdisplay(MESSAGE_DEBUG,df_eqtl_wgcna_onexpgenes)
			printdisplay(MESSAGE_DEBUG,df_gwass)

	else:
		printdisplay(MESSAGE_DEBUG,outfile + ' exists')
		if use_snpgene:
			df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=pd.read_csv(outfile,sep="\t")
			printdisplay(MESSAGE_INFO,'loaded ' + outfile)
			printdisplay(MESSAGE_INFO,df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait,'df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait')


	if use_snpgene:
		# combine 1st and seconf lines from grouped
		newcols=[]
		with open(outfile) as fin, open('groupcols_' +  outfile,'wt') as fout:
			cols1=fin.readline().rstrip().split("\t")
			cols2=fin.readline().rstrip().split("\t")
			newcols.append(cols1[0])
			newcols.append(cols1[1])
			for i in range(2,len(cols1)):
				newcols.append("_".join([cols1[i], cols2[i]]))
			fout.write("\t".join(newcols) + "\n")
			line=fin.readline()
			while line:
				fout.write(line)
				line=fin.readline()

		#osrm(outfile)
		#osmv(outfile+'.groupcols.tsv ' + outfile)
		df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=pd.read_csv('groupcols_' + outfile,sep="\t")
		df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=pd.merge(df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait,clean_df_gene(df_genes),left_on='snpgenes',right_on='gene',how='left')
		#df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=do_snps_getgenes(df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait,df_genes,None, snpcolumn='snps',other_columns=['trait','pvalue_eqtl','GS_wgcna','pGS_wgcna','pvalue_gwas'])
		df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait.to_csv('triple_snpgenetrait_' + outfile,sep="\t",index=False)
		#printdisplay(MESSAGE_INFO,'generated ' + 'triple_snpgenetrait_' + outfile)
		printdisplay(MESSAGE_INFO, 'triple_snpgenetrait_' + outfile,'triple_snpgenetrait_' + outfile)


	#df_eqtl_wgcna_onexpgenes_gwas_onsnptrait

	#if not os.path.exists('distinct-snpgenetrait-1-' + outfile):
	if False:
		if use_snpgene:
			if recalc or not os.path.exists('eqtl_wgcna_onexpgene-gwas_ontraitsnpgenes-' + outfile):
				printdisplay(MESSAGE_INFO,'df_eqtl_wgcna_onexpgenes_gwas_onsnptrait ' + str(df_eqtl_wgcna_onexpgenes_gwas_onsnptrait.columns.values.tolist()))
				df_eqtl_wgcna_onexpgenes_df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait__nsnpgenetrait=pd.merge(df_eqtl_wgcna_onexpgenes,df_eqtl_wgcna_onexpgenes_gwas_onsnptrait,on=['trait','snpgenes'])
				df_eqtl_wgcna_onexpgenes_df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait__nsnpgenetrait.to_csv('eqtl_wgcna_onexpgene-gwas_ontraitsnpgenes-' + outfile, sep="\t",index=False)
				printdisplay(MESSAGE_DEBUG,'df_eqtl_wgcna_onexpgenes_df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait__nsnpgenetrait ' + str(df_eqtl_wgcna_onexpgenes_df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait__nsnpgenetrait.columns.values.tolist()))
				printdisplay(MESSAGE_INFO,'generated ' + 'eqtl_wgcna_onexpgene-gwas_ontraitsnpgenes-' + outfile)
			else:
				df_eqtl_wgcna_onexpgenes_df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait__nsnpgenetrait=pd.read_csv('eqtl_wgcna_onexpgene-gwas_ontraitsnpgenes-' + outfile, sep="\t")

			#expgenes_x      pvalue  snps    snpgenes        contig  start   end     note    goterm  iprterm trait   GS_x    p.GS_x  expgenes_y      pvalue_x        snps_x  contig_x        start_x end_x   note_x  goterm_x        iprterm_x       GS_y    p.GS_y  pvalue_y        snps_y  contig_y        start_y end_y   note_y  goterm_y        iprterm_y
			df_unique=df_eqtl_wgcna_onexpgenes_df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait__nsnpgenetrait.drop_duplicates()
			df_unique.to_csv('eqtl_wgcna_onexpgene-gwas_ontraitsnpgenes-distinct-' + outfile ,sep="\t",index=False)

			df_unique_snpexptrait=df_unique[['snpgenes','expgenes_x','trait', 'note','goterm','note_x','goterm_x' ]].drop_duplicates().sort_values(['snpgenes','trait'])
			df_unique_snpexptrait.to_csv('eqtl_wgcna_onexpgene-gwas_ontraitsnpgenes-distinct-snpgenexpgenetrait' + outfile)
			df_unique_snptrait=df_unique_snpexptrait[['snpgenes','trait','note','goterm','note_x','goterm_x']].drop_duplicates()
			df_unique_snptrait.to_csv('eqtl_wgcna_onexpgene-gwas_ontraitsnpgenes-distinct-snpgenetrait-2-' + outfile,sep="\t",index=False)

			#expgenes        pvalue_x        snps_x  snpgenes        contig_x        start_x end_x   note_x  goterm_x        iprterm_x       trait   GS      p.GS    pvalue_y        snps_y  contig_y        start_y end_y   note_y  goterm_y        iprterm_y
			df_eqtl_wgcna_onexpgenes_gwas_onsnptrait=df_eqtl_wgcna_onexpgenes_gwas_onsnptrait[['snpgenes','trait','note_x','goterm_x','note_y','goterm_y' ]].drop_duplicates()
			df_eqtl_wgcna_onexpgenes_gwas_onsnptrait.to_csv('distinct-snpgenetrait-1-' + outfile,sep="\t",index=False)


def merge_gwas_wgcna0(gwas_label, wgcna_label):
	"""Load the WGCNA annotated top gene-trait table for `wgcna_label`.

	NOTE(review): the loaded frame is currently neither returned nor used,
	and `gwas_label` is unused — presumably an unfinished stub; confirm.
	"""
	wgcna_table = pd.read_csv(wgcna_label + '.relateModsToExt.top.genetrait.annot.tsv', sep="\t")



def get_genes_by_accession0(acclist,annot='all',index_col=None):
	"""Fetch gene records for a list of accessions from the icgrc.info API.

	Requests are batched (100 accessions per call) and the merged result is
	cached to a tsv file keyed by a hash of the accession list, so repeated
	calls with identical arguments are served from disk.

	After download, annotation columns (gb_keyword, go_term, interpro_value)
	of genes sharing the same coordinates (contig/fmin/fmax) are merged into
	the row of each requested accession.

	Parameters:
		acclist: iterable of accession names (de-duplicated and sorted here).
		annot: annotation set requested from the API ('all' by default).
		index_col: forwarded to pandas.read_csv.

	Returns the combined DataFrame sorted by contig/fmin/fmax.
	"""
	batch=100
	cnt=0
	acclist=sorted(set(acclist))
	nlist=len(acclist)
	df=None
	printdisplay(MESSAGE_DEBUG,'get_genes_by_accession0 ' + str(nlist))

	fname='get_genes_by_accession0_' + myhash(str(acclist))+ "_" + annot + '.tsv'
	if os.path.exists(fname):
		df=pd.read_csv(fname,sep='\t',index_col=index_col)
	else:
		while cnt<nlist:
			batchlist=acclist[cnt:(min(cnt+batch,nlist))]

			myurl="https://icgrc.info/api/user/gene/all?annot=" + annot + "&accession=" + ",".join(list(batchlist))
			printdisplay(MESSAGE_DEBUG,myurl)
			c=pd.read_csv(myurl,sep='\t',index_col=index_col)

			# strip stray whitespace from the API's column headers
			# (loop variable renamed: the original comprehension shadowed `c`)
			c.rename(columns = {col: col.strip() for col in c.columns}, inplace = True)
			c['name_idx']=c['name']
			c=c.set_index(['name_idx'])

			if df is None:
				df=c
			else:
				df=pd.concat([df, c], axis=0)

			cnt+=batch

		# O(1) membership test inside the per-row loop below
		accset=set(acclist)
		# merge annotations of co-located genes into each requested gene's row
		for index, row1 in df.iterrows():
			df_expgene3=df[(df['contig']==row1['contig']) & (df['fmin']==row1['fmin']) & (df['fmax']==row1['fmax'])]
			namelist=set()
			gblist=set()
			golist=set()
			iprlist=set()
			for index2, row2 in df_expgene3.iterrows():
				namelist.add(row2['name'])
				# skip empty / NaN annotation values (NaN stringifies to 'nan')
				if not (len(str(row2['gb_keyword']))==0 or str(row2['gb_keyword'])=='nan'):
					gblist.add(str(row2['gb_keyword']))
				if not (len(str(row2['go_term']))==0 or str(row2['go_term'])=='nan'):
					golist.add(str(row2['go_term']))
				if not (len(str(row2['interpro_value']))==0 or str(row2['interpro_value'])=='nan'):
					iprlist.add(str(row2['interpro_value']))
			if row1['name'] in accset:
				df.at[index,'gb_keyword']=",".join(list(gblist)) + " (" + ",".join(list(namelist)) +")"
				df.at[index,'go_term']=",".join(list(golist))
				df.at[index,'interpro_value']=",".join(list(iprlist))

		df=df.sort_values(['contig','fmin','fmax'])
		df.to_csv(fname, sep="\t", encoding="utf-8", index=False)

	printdisplay(MESSAGE_DEBUG,'get_genes_by_accession0 ' + str(df.shape))
	return df


def get_genes_by_position0(poslist,annot,index_col=None):
	"""Fetch gene records overlapping a list of positions from the icgrc.info API.

	Requests are batched (100 positions per call); the merged result is
	cached to a tsv file keyed by a hash of the position list, so repeated
	calls with identical arguments are served from disk.

	Parameters:
		poslist: iterable of position specifiers accepted by the API.
		annot: annotation set requested from the API.
		index_col: forwarded to pandas.read_csv.

	Returns the combined DataFrame sorted by contig/fmin/fmax.
	"""
	batch=100
	cnt=0
	poslist=sorted(set(poslist))
	nlist=len(poslist)
	df=None
	printdisplay(MESSAGE_DEBUG,'get_genes_by_position0 ' + str(nlist))

	fname='get_genes_by_position0_' + myhash(str(poslist))+ "_" + annot + '.tsv'
	if os.path.exists(fname):
		printdisplay(MESSAGE_DEBUG,"reading cached file " + fname)
		df=pd.read_csv(fname,sep='\t',index_col=index_col)
	else:
		while cnt<nlist:
			batchlist=poslist[cnt:(min(cnt+batch,nlist))]
			myurl="https://icgrc.info/api/user/gene/" + ",".join(batchlist) + "?annot=" + annot
			printdisplay(MESSAGE_DEBUG,myurl)
			c=pd.read_csv(myurl,sep='\t',index_col=index_col)
			# strip stray whitespace from the API's column headers
			# (loop variable renamed: the original comprehension shadowed `c`)
			c.rename(columns = {col: col.strip() for col in c.columns}, inplace = True)

			if df is None:
				df=c
			else:
				df=pd.concat([df, c], axis=0)

			cnt+=batch

		df=df.sort_values(['contig','fmin','fmax'])
		df.to_csv(fname, sep="\t", encoding="utf-8", index=False)

	printdisplay(MESSAGE_DEBUG,'get_genes_by_position0 ' + str(df.shape))
	return df



def clean_property(x):
	"""Normalise a comma-separated property value.

	Splits str(x) on commas, de-duplicates and sorts the tokens, and returns
	them rendered as a Python list literal with the surrounding brackets
	stripped (each token keeps its repr quotes), e.g. 'b,a,a' -> "'a', 'b'".
	"""
	tokens = sorted(set(str(x).split(",")))
	rendered = str(tokens)
	return rendered.replace("[", "").replace("]", "").strip()

def check_batcheffects0(df, dflabel,datatype, batch_id='NCBI BioProject',on_missing='most_frequent',whiten=False,correct_BE=False,be_method='pycombat'): # , 'expression_dataset','phenotype_dataset']):
	"""Check (and optionally correct) batch effects for one datatype.

	df: long-format frame with 'datatype' and 'property' columns plus one
	column per sample. dflabel: label used for output file names. datatype:
	which datatype's rows to analyse (e.g. TYPE_EXP). batch_id: the property
	row naming each sample's batch. on_missing/whiten/correct_BE/be_method
	are forwarded to check_pca0.

	Writes 'becorrected_<dflabel>.tsv' and returns a tuple
	(corrected_long_frame, {batch_name: per-batch sub-frame}).

	NOTE(review): unlike check_batcheffects1, the final block here runs
	unconditionally ('if True:'), so the corrected result is assembled and
	returned even when correct_BE is False — confirm this is intended.
	"""
	properties=[batch_id]
	dforig=df
	#keepid=['NCBI BioProject', 'expression_dataset']
	# keep only rows of the requested datatype plus the batch-id property row
	dfclean=df[(df['datatype']==datatype) | (df['property'].isin(properties)) ]
	printdisplay(MESSAGE_DEBUG,dfclean)
	#.drop('datatype',axis=1).set_index('property').transpose()
	#df=dfclean[(dfclean['datatype']==datatype)].drop('datatype',axis=1).set_index('property').transpose()
	# transpose to samples x properties, the layout check_pca0 expects
	df=dfclean.drop('datatype',axis=1).set_index('property').transpose()
	#display(df)

	# normalise the batch labels (sorted, de-duplicated string form)
	df[properties[0]]=df[properties[0]].map(clean_property)
	#printdisplay(MESSAGE_INFO,df,'df')

	#check_pca0(df,dflabel,properties[0],plot_features=True)
	# check_pca0 returns the (possibly BE-corrected) values, properties x samples
	dfcorrected=check_pca0(df,dflabel,properties[0],do_pca=True,whiten=whiten,on_missing=on_missing,correct_BE=correct_BE,be_method=be_method)
	dfcorrected['property']=dfcorrected.index
	if True: #correct_BE:

		#idpops=[x for x in dforig.columns.values.tolist() if x not in dfcorrected.columns.values.tolist()]
		#newdf=dforig[ (dforig['datatype']==TYPE_ID) |  (dforig['datatype']==TYPE_PROP)]
		
		#newdf=dforig[dforig['datatype'].isin([TYPE_ID,TYPE_PROP])]
		# keep every row of the other datatypes unchanged
		newdf=dforig[dforig['datatype']!=datatype]

		#printdisplay(MESSAGE_DEBUG,dfcorrected,'dfcorrected')
		#printdisplay(MESSAGE_DEBUG,newdf,'newdf')

		dfcorrected['datatype']=datatype
		#dfcorrected['property'] = dfcorrected.index
		try:
			dfcorrected.reset_index(inplace=True)
		except Exception as ex:
			printdisplay(MESSAGE_INFO,str(ex))
			pass

		#printdisplay(MESSAGE_DEBUG,dfcorrected,'dfcorrected')

		# splice the corrected rows back under the untouched rows
		newdf=pd.concat([newdf,dfcorrected]).reset_index(drop=True)

		if 'index' in newdf.columns.values.tolist():
			newdf=newdf.drop(['index'], axis=1)

		#.reset_index(drop=True).drop(['index'], axis=1)

		printdisplay(MESSAGE_INFO,dforig,'original data')
		#printdisplay(MESSAGE_INFO,newdf,'generated corrected ' + datatype)

		newdf.to_csv('becorrected_' + dflabel + '.tsv',sep="\t",index=False)
		printdisplay(MESSAGE_INFO,'generated becorrected_' + dflabel + '.tsv')
		printdisplay(MESSAGE_INFO,newdf,'becorrected_' + dflabel + '.tsv')
		
			
		mapbatches=dict()
		
		#display(df[properties[0]].tolist())
		# build one sub-frame per batch label (quotes stripped from labels)
		for batch in list(set(df[properties[0]].tolist())):
			batch=batch.replace("'","")
		#for batch in ["'Zager2019'", "'Booth2020'", "'GloerfeltTarp2023'"]:
			mapbatches[batch]=select_columns_byproperty(newdf,properties[0],batch)

		return (newdf,mapbatches)



def check_batcheffects1(df, dflabel,datatype, batch_id='NCBI BioProject',on_missing='most_frequent',whiten=False,correct_BE=False): # , 'expression_dataset','phenotype_dataset']):
	"""Older variant of check_batcheffects0 (no be_method parameter).

	Differences from check_batcheffects0: the result is only assembled and
	returned when correct_BE is True (otherwise the function falls through
	and returns None), and the untouched rows kept alongside the corrected
	ones are restricted to TYPE_ID/TYPE_PROP rows rather than every other
	datatype.

	When correct_BE is True: writes 'becorrected_<dflabel>.tsv' and returns
	(corrected_long_frame, {batch_name: per-batch sub-frame}).
	"""
	properties=[batch_id]
	dforig=df
	#keepid=['NCBI BioProject', 'expression_dataset']
	# keep only rows of the requested datatype plus the batch-id property row
	dfclean=df[(df['datatype']==datatype) | (df['property'].isin(properties)) ]
	printdisplay(MESSAGE_DEBUG,dfclean)
	#.drop('datatype',axis=1).set_index('property').transpose()
	#df=dfclean[(dfclean['datatype']==datatype)].drop('datatype',axis=1).set_index('property').transpose()
	# transpose to samples x properties, the layout check_pca0 expects
	df=dfclean.drop('datatype',axis=1).set_index('property').transpose()
	#display(df)

	# normalise the batch labels (sorted, de-duplicated string form)
	df[properties[0]]=df[properties[0]].map(clean_property)
	#printdisplay(MESSAGE_INFO,df,'df')

	#check_pca0(df,dflabel,properties[0],plot_features=True)
	dfcorrected=check_pca0(df,dflabel,properties[0],do_pca=True,whiten=whiten,on_missing=on_missing,correct_BE=correct_BE)
	dfcorrected['property']=dfcorrected.index
	if correct_BE:
		#idpops=[x for x in dforig.columns.values.tolist() if x not in dfcorrected.columns.values.tolist()]
		#newdf=dforig[ (dforig['datatype']==TYPE_ID) |  (dforig['datatype']==TYPE_PROP)]
		
		# keep the identifier/property rows unchanged
		newdf=dforig[dforig['datatype'].isin([TYPE_ID,TYPE_PROP])]
		
		#printdisplay(MESSAGE_DEBUG,dfcorrected,'dfcorrected')
		#printdisplay(MESSAGE_DEBUG,newdf,'newdf')

		dfcorrected['datatype']=datatype
		#dfcorrected['property'] = dfcorrected.index
		try:
			dfcorrected.reset_index(inplace=True)
		except Exception as ex:
			printdisplay(MESSAGE_INFO,str(ex))
			pass

		#printdisplay(MESSAGE_DEBUG,dfcorrected,'dfcorrected')

		# splice the corrected rows back under the untouched rows
		newdf=pd.concat([newdf,dfcorrected]).reset_index(drop=True)

		if 'index' in newdf.columns.values.tolist():
			newdf=newdf.drop(['index'], axis=1)

		#.reset_index(drop=True).drop(['index'], axis=1)

		printdisplay(MESSAGE_INFO,dforig,'original data')
		#printdisplay(MESSAGE_INFO,newdf,'generated corrected ' + datatype)

		newdf.to_csv('becorrected_' + dflabel + '.tsv',sep="\t",index=False)
		printdisplay(MESSAGE_INFO,'generated becorrected_' + dflabel + '.tsv')
		printdisplay(MESSAGE_INFO,newdf,'becorrected_' + dflabel + '.tsv')
		
			
		mapbatches=dict()
		
		#display(df[properties[0]].tolist())
		# build one sub-frame per batch label (quotes stripped from labels)
		for batch in list(set(df[properties[0]].tolist())):
			batch=batch.replace("'","")
		#for batch in ["'Zager2019'", "'Booth2020'", "'GloerfeltTarp2023'"]:
			mapbatches[batch]=select_columns_byproperty(newdf,properties[0],batch)

		return (newdf,mapbatches)



from ppca import PPCA
#import pyppca2
from pyppca import ppca
from combat.pycombat import pycombat


def plot_pcas(components,df,dfvalues,ncomp,features3dpca,colorby,colorsequence,xy,outfile):
	"""Render PCA scatter plots: PC scatter matrix, static 3-D, interactive 3-D.

	components: PCA scores array, one row per sample, ncomp columns.
	df: frame holding the metadata column named by colorby, indexed by sample.
	dfvalues: value frame whose index supplies the sample names for the rows
	of components. features3dpca: the three PC column names to plot.
	xy: optional (width, height) in pixels for the written png files.

	Side effects only (no return value): writes '<outfile>.pca.png',
	'<outfile>.pca.3d.png' and '<outfile>.pca.3d.html', and shows the
	figures via pltshow.
	"""

	imgscale=3
	opacity=0.7
	size_max=3

	# axis labels 'PC n' keyed by component index (plotly labels mapping)
	labels = {
		str(i): f"PC {i+1}"
		for i in range(ncomp)
	}

	# scores as a frame with columns PC1..PC<ncomp>, one row per sample
	columnspca = ['PC%i' % i for i in range(1,ncomp+1)]
	df_pca  = pd.DataFrame(components, columns=columnspca, index=dfvalues.index)
	printdisplay(MESSAGE_DEBUG,str(df_pca.index.tolist()),'df_pca.index.tolist()')
	printdisplay(MESSAGE_DEBUG,str(df.index.tolist()),'df.index.tolist()')

	# join scores with the sample metadata on the sample name
	df_pca['sample'] = df_pca.index
	df['sample'] = df.index
	printdisplay(MESSAGE_DEBUG,df_pca,'df_pca')
	printdisplay(MESSAGE_DEBUG,df,'df')
	#df.set_index('property')
	df_dfpca=pd.merge(df_pca,df,on='sample')
	#df_dfpca=pd.merge(df,df_pca,left_on='property',right_on='sample')
	printdisplay(MESSAGE_DEBUG,df_dfpca,'df_dfpca')
	#df=df_dfpca
	# pairwise scatter matrix over the selected PCs
	fig = px.scatter_matrix(
		df_dfpca,
		labels=labels,
		dimensions=features3dpca, #range(ncomp),
		color=colorby,
		#symbol='marker',
		opacity =opacity,
		#size='dummy_column_for_size',
		size_max=size_max,
		color_discrete_sequence=colorsequence

	)
	#fig.update_traces(diagonal_visible=False)
	fig.update_traces(diagonal_visible=False,marker=dict(line=dict(width=0)))
	#fig.show()
	if xy:
		fig.write_image(outfile + '.pca.png', width=xy[0], height=xy[1],scale=imgscale)
	else:
		fig.write_image(outfile + '.pca.png',scale=imgscale)
	printdisplay(MESSAGE_INFO,'generated ' + outfile + '.pca.png')
	pltshow(MESSAGE_INFO,fig)

	#pltshow(MESSAGE_INFO,outfile + '.pca.png')

	# static 3-D scatter of the three selected PCs
	fig = px.scatter_3d(df_dfpca,  x=features3dpca[0], y=features3dpca[1], z=features3dpca[2],
		color=colorby,
		#symbol='marker',
		size_max=size_max,
		#size='dummy_column_for_size',
		opacity =opacity,
		#color_discrete_sequence=px.colors.qualitative.Dark24
		color_discrete_sequence=colorsequence
	)
	#fig.update_layout(margin=dict(l=0,r=0,t=0,b=0), scene_camera=dict(eye=dict(x=2.0, y=2.0, z=0.75)))
	fig.update_layout(margin=dict(l=0,r=0,t=0,b=0), scene_camera=dict(eye=dict(x=2.5, y=2.5, z=1)))

	if xy:
		fig.write_image(outfile + '.pca.3d.png', width=xy[0], height=xy[1],scale=imgscale)
	else:
		fig.write_image(outfile + '.pca.3d.png',scale=imgscale)

	pltshow(MESSAGE_INFO,fig)
	printdisplay(MESSAGE_INFO,'generated ' + outfile + '.pca.3d.png')

	# interactive 3-D version saved as html (larger markers, for hovering)
	fig = px.scatter_3d(df_dfpca,  x=features3dpca[0], y=features3dpca[1], z=features3dpca[2],
		color=colorby,
		#symbol='marker',
		size_max=10,
		#size='dummy_column_for_size_large',
		#hover_name="sample", hover_data=["geogroup"],
		opacity =opacity,
		#color_discrete_sequence=px.colors.qualitative.Dark24
		color_discrete_sequence=colorsequence
	)
	#fig.update_layout(margin=dict(l=0,r=0,t=0,b=0), scene_camera=dict(eye=dict(x=2.0, y=2.0, z=0.75)))
	fig.update_layout(margin=dict(l=0,r=0,t=0,b=0), scene_camera=dict(eye=dict(x=2.5, y=2.5, z=1)))
	fig.write_html(outfile + '.pca.3d.html')
	printdisplay(MESSAGE_INFO,'generated ' + outfile + '.pca.3d.html')


def check_pca0(df,inf,colorby,features=None,features3d=None,plot_features=False, do_pca=False, xy=(500,500),whiten=False,ncomp=5,features3dpca=('PC1','PC2','PC3'),on_missing='most_frequent',correct_BE=False,be_method='ppca'): #SimpleImputer, PPCA,most_frequent
	"""Plot raw features and/or run (probabilistic) PCA with optional batch-effect correction.

	df: samples x properties frame that also carries the batch column named
	by colorby. inf: label used for output file names. features: value
	columns to use (default: every column except colorby). on_missing: how
	to handle NaNs ('KNNImputer', 'most_frequent'/'mean'/'median', 'ppca',
	or None to refuse missing data). correct_BE: also correct batch effects
	with be_method ('ppca'+pycombat, 'eblm', 'rcombat', 'eblm1', 'pycombat').

	Returns the (imputed, possibly batch-corrected) values transposed back
	to properties x samples, or None when missing values are present but
	on_missing is None. NOTE: mutates the caller's df (adds helper columns).

	Fixes vs. original: np.NaN -> np.nan (the upper-case alias was removed
	in NumPy 2.0) and the unknown-on_missing error is a ValueError.
	"""
	imgscale=3
	opacity=0.7
	size_max=3

	# default feature set: every column except the colour/batch column
	if not features:
		features=[x for x in df.columns.values.tolist() if x not in [colorby]]
	if not features3d:
		features3d=features[:3]

	# helper columns controlling marker size (mutates the caller's frame)
	df['dummy_column_for_size'] = 2
	df['dummy_column_for_size_large'] = 10
	colorsequence=px.colors.qualitative.Dark24 # T10, D3
	outfile=inf
	if plot_features:
		print('colors palletes')
		print(dir(px.colors.qualitative))
		print('colorby=' + str(colorby))

		# static 3-D scatter of the first three raw features
		fig = px.scatter_3d(df,  x=features3d[0], y=features3d[1], z=features3d[2],
			color=colorby,
			size_max=size_max,
			size='dummy_column_for_size',
			opacity =opacity,
			color_discrete_sequence=colorsequence
		)
		fig.update_layout(margin=dict(l=0,r=0,t=0,b=0), scene_camera=dict(eye=dict(x=2.5, y=2.5, z=1)))

		if xy:
			fig.write_image(inf + '.3d.png', width=xy[0], height=xy[1],scale=imgscale)
		else:
			fig.write_image(inf + '.3d.png',scale=imgscale)

		pltshow(MESSAGE_INFO,fig)
		printdisplay(MESSAGE_INFO,'generated ' + inf + '.3d.png')

		# interactive 3-D version saved as html (larger markers)
		fig = px.scatter_3d(df, x=features3d[0], y=features3d[1], z=features3d[2],
			color=colorby,
			size_max=10,
			opacity =opacity,
			color_discrete_sequence=colorsequence
		)
		fig.update_layout(margin=dict(l=0,r=0,t=0,b=0), scene_camera=dict(eye=dict(x=2.5, y=2.5, z=1)))
		fig.write_html(outfile + '.3d.html')
		printdisplay(MESSAGE_INFO,'generated ' + inf + '.3d.html')

		# pairwise scatter matrix of the raw features
		fig = px.scatter_matrix(
			df,
			dimensions=features3d,
			color=colorby,
			opacity =opacity,
			size='dummy_column_for_size',
			size_max=size_max,
			color_discrete_sequence=colorsequence
		)
		fig.update_traces(diagonal_visible=False,marker=dict(line=dict(width=0)))

		if xy:
			fig.write_image(outfile + '.png', width=xy[0], height=xy[1],scale=imgscale)
		else:
			fig.write_image(outfile + '.png',scale=imgscale)

		pltshow(MESSAGE_INFO,fig)
		printdisplay(MESSAGE_INFO,'generated ' + inf + '.png')


	if do_pca or correct_BE:

		printdisplay(MESSAGE_DEBUG,df,'df')
		dfvalues=df[features]
		# drop rows/columns that are entirely NaN or zero before imputation
		dfvalues=drop_allnazero_rowcol(dfvalues)
		features=dfvalues.columns.values.tolist()

		if dfvalues.isnull().sum().sum()>0 and on_missing is None:
			printdisplay(MESSAGE_INFO,"Not expecting missing values in data")
			return None

		if dfvalues.isnull().sum().sum()>0:
			printdisplay(MESSAGE_INFO,'for PCA has NaN')
			if on_missing=='KNNImputer':
				printdisplay(MESSAGE_INFO,'imputing using KNNImputer')
				imputer = KNNImputer(n_neighbors=2, weights="uniform")
				printdisplay(MESSAGE_DEBUG,dfvalues[features],'imputing using KNNImputer')

				dfvalues[features]=imputer.fit_transform(dfvalues[features])
				printdisplay(MESSAGE_DEBUG,dfvalues[features],'imputed using KNNImputer')

			elif  on_missing in ['most_frequent','mean','median']:
				printdisplay(MESSAGE_INFO,'imputing using ' + on_missing)
				printdisplay(MESSAGE_DEBUG,dfvalues[features],'imputing using ' + on_missing)
				# np.nan, not np.NaN: the upper-case alias was removed in NumPy 2.0
				imputer = SimpleImputer(missing_values=np.nan, strategy=on_missing)
				dfvalues[features]=imputer.fit_transform(dfvalues[features])
				printdisplay(MESSAGE_DEBUG,dfvalues[features],'imputed using ' + on_missing)
			elif on_missing=='ppca':
				# probabilistic PCA below handles missing values itself
				pass
			else:
				raise ValueError('unknown on_missing ' + on_missing)

		printdisplay(MESSAGE_DEBUG,dfvalues,'dfvalues')

		if dfvalues.isnull().sum().sum()>0:
			printdisplay(MESSAGE_INFO,'for PCA has NaN,  on_missing=' + on_missing)


		if on_missing=='ppca':
			printdisplay(MESSAGE_INFO,'imputing using ppca')
			# ppca expects features x samples; Ye is the reconstructed data
			components, ss, M, X, Ye = ppca(dfvalues.transpose().to_numpy().astype('float'),d=ncomp,dia=False)

			printdisplay(MESSAGE_DEBUG, str(np.shape(components)) ,'components')
			printdisplay(MESSAGE_DEBUG, str(np.shape(Ye)) ,'Ye')
			dfvalues=pd.DataFrame(np.transpose(Ye),columns=features, index=dfvalues.index)
			printdisplay(MESSAGE_DEBUG,dfvalues,'data_corrected')

			if correct_BE:
				printdisplay(MESSAGE_DEBUG,dfvalues,'data_cleaned') #sample x prop
				cleansamples=dfvalues.index.values.tolist()
				cleanprops=dfvalues.columns.values.tolist()
				printdisplay(MESSAGE_DEBUG,[colorby] + cleansamples,'[colorby] + cleansamples')
				printdisplay(MESSAGE_DEBUG,df,'df')

				# re-align metadata to the surviving samples/properties
				cleaddf=df.loc[cleansamples,[colorby]+cleanprops]
				printdisplay(MESSAGE_DEBUG,cleaddf,'cleaddf')
				batch=cleaddf[colorby].tolist()
				printdisplay(MESSAGE_DEBUG,batch,'batch')
				printdisplay(MESSAGE_DEBUG,dfvalues,'data_cleaned')

				if be_method in ['eblm', 'rcombat']:
					printdisplay(MESSAGE_INFO,'correcting batch effect using ' + be_method)
					dfannot=pd.DataFrame(cleaddf[colorby],index=cleaddf.index)
					dfvaluesannot=pd.merge(dfvalues,dfannot,left_index=True, right_index=True).reset_index(names='sample')

					dfvalues=do_be_eblm_values(dfvaluesannot[cleanprops],inf,dfvaluesannot[[colorby, 'sample']],covar=colorby,be_method=be_method)
					# drop the helper 'X' column added by the R script, restore sample index
					newcolumnnames=dfvalues.columns.values.tolist()
					newcolumnnames.remove("X")

					dfvaluessamples=pd.merge(dfvalues,dfvaluesannot['sample'],left_index=True, right_index=True)
					dfvaluessamples=dfvaluessamples.set_index('sample')
					dfvaluessamples.index.name='index'
					dfvalues=dfvaluessamples.loc[:,newcolumnnames]
				else:
					printdisplay(MESSAGE_INFO,'correcting batch effect using pycombat')
					dfvalues = pycombat(dfvalues.transpose().astype(float),batch).transpose()

				printdisplay(MESSAGE_DEBUG,dfvalues,'data_corrected')

				# re-run ppca on the corrected data for the 'after' plots
				components_be, ss_be, M_be, X_be, Ye_be = ppca(dfvalues.transpose().to_numpy().astype('float'),d=ncomp,dia=False)
			else:
				return dfvalues.transpose()

		else:
			pca = PCA(ncomp,whiten=whiten)
			components = pca.fit_transform(dfvalues)

			if correct_BE:
				printdisplay(MESSAGE_DEBUG,dfvalues,'data_cleaned')
				cleansamples=dfvalues.index.values.tolist()
				cleanprops=dfvalues.columns.values.tolist()
				cleaddf=df.loc[cleansamples,[colorby]+cleanprops]
				batch=cleaddf[colorby].tolist()
				printdisplay(MESSAGE_DEBUG,batch,'batch')


				if be_method=='eblm1':
					printdisplay(MESSAGE_INFO,'correcting batch effect using eblm')
					dfannot=pd.DataFrame(cleaddf[colorby],index=cleaddf.index)
					dfvaluesannot=pd.merge(dfvalues,dfannot,left_index=True, right_index=True).reset_index(drop=True)

					dfannot=pd.DataFrame(dfvaluesannot[colorby],index=dfvaluesannot.index)

					dfvalues=do_be_eblm_values(dfvaluesannot,inf,dfannot,covar=colorby)
				elif be_method in ['eblm','rcombat']:
					printdisplay(MESSAGE_INFO,'correcting batch effect using ' + be_method)
					dfannot=pd.DataFrame(cleaddf[colorby],index=cleaddf.index)
					dfvaluesannot=pd.merge(dfvalues,dfannot,left_index=True, right_index=True).reset_index(names='sample')

					dfvalues=do_be_eblm_values(dfvaluesannot[cleanprops],inf,dfvaluesannot[[colorby, 'sample']],covar=colorby,be_method=be_method)
					# drop the helper 'X' column added by the R script, restore sample index
					newcolumnnames=dfvalues.columns.values.tolist()
					newcolumnnames.remove("X")

					dfvaluessamples=pd.merge(dfvalues,dfvaluesannot['sample'],left_index=True, right_index=True)
					dfvaluessamples=dfvaluessamples.set_index('sample')
					dfvaluessamples.index.name='index'
					dfvalues=dfvaluessamples.loc[:,newcolumnnames]
				elif be_method=='pycombat':
					printdisplay(MESSAGE_INFO,'correcting batch effect using pycombat')
					dfvalues = pycombat(dfvalues.transpose().astype(float),batch).transpose()


				printdisplay(MESSAGE_DEBUG,dfvalues,'data_corrected')

				# re-run PCA on the corrected data for the 'after' plots
				pca_be = PCA(ncomp,whiten=whiten)
				components_be = pca_be.fit_transform(dfvalues)
			else:
				return dfvalues.transpose()


		# before-correction plots, plus after-correction plots when requested
		plot_pcas(components,df,dfvalues,ncomp,features3dpca,colorby,colorsequence,xy,outfile)
		if correct_BE:
			plot_pcas(components_be,df,dfvalues,ncomp,features3dpca,colorby,colorsequence,xy,outfile+'.be')


		if correct_BE:
			return dfvalues.transpose()


'''
Select only columns with value in property
'''
def select_columns_byproperty(df_in,prop,value):
	"""Return the columns of df_in whose value in row `prop` equals `value`.

	df_in is expected to have a 'property' column (used as the row index)
	and a 'datatype' column.  Single quotes in `value` are stripped before
	comparison.  The result has 'property' restored as a column and the
	'datatype' column moved to the first position.
	"""
	value=value.replace("'","")
	df1=df_in.set_index('property')
	# Keep only the columns whose entry in row `prop` matches `value`.
	# .copy() makes df an independent frame so the assignment below does
	# not write into a view of df1 (avoids SettingWithCopyWarning and
	# unpredictable chained-assignment behavior).
	df=df1.loc[:, df1.loc[prop] == value].copy()
	df.loc[:,'datatype']=df1.loc[:,'datatype']
	df=df.reset_index()
	# Move 'datatype' to the front.
	col = df.pop("datatype")
	df.insert(0, col.name, col)
	return df



from math import log

#https://stats.stackexchange.com/questions/24961/comparing-clusterings-rand-index-vs-variation-of-information
#https://cdlib.readthedocs.io/en/latest/reference/eval/cdlib.evaluation.variation_of_information.html
#https://jejjohnson.github.io/research_journal/appendix/similarity/vi/

#https://gist.githubusercontent.com/jwcarr/626cbc80e0006b526688/raw/48846b5975b9d48ce28cbc9ae19a8f70006fdbfa/vi.py
def variation_of_information(X, Y):
	"""Variation of information (VI) between two partitions X and Y.

	X and Y are lists of clusters (iterables of hashable items) covering
	the same n items.  Returns a tuple (vi, nvi): vi is the absolute VI in
	bits, and nvi is the normalized variant 1 - I/H computed from HI(),
	or None when the joint entropy H is 0.
	"""
	n = float(sum(len(x) for x in X))
	sigma = 0.0
	for x in X:
		p = len(x) / n
		for y in Y:
			q = len(y) / n
			r = len(set(x) & set(y)) / n
			if r > 0.0:
				sigma += r * (log(r / p, 2) + log(r / q, 2))

	# Normalized VI via joint entropy H and mutual information I (VI = H - I).
	h, i = HI(X, Y)
	nvi = None
	if h != 0.0:
		nvi = 1 - i / h
	# NOTE: the old `if sigma is None: sigma = vi_hi` fallback was dead
	# code (sigma is always a float here) and has been removed.
	return (abs(sigma), nvi)

def chisquare(X,Y,ddof=0):
	"""One-way chi-square test comparing cluster sizes of X against Y.

	X and Y are lists of clusters; the observed frequencies are the
	cluster sizes of X and the expected frequencies the sizes of Y.
	Returns (statistic, p-value).  Raises ValueError when the totals
	differ (scipy requires sum(f_obs) == sum(f_exp)).
	"""
	n = [len(x) for x in X]
	m = [len(y) for y in Y]
	if sum(n)!= sum(m):
		# Bug fix: the original `raise('...')` raised a plain string,
		# which is itself a TypeError in Python 3; raise a real exception.
		raise ValueError('chisquare n!=m ' + str(n) + ' ' + str(m))

	# Local alias avoids shadowing confusion with this function's own name.
	from scipy.stats import chisquare as scipy_chisquare
	(stat,pval)=scipy_chisquare(f_obs=n, f_exp=m,ddof=ddof)
	return (stat,pval)

def chisquare_contingency0(X,Y):
	"""Chi-square test of independence on the contingency table of X vs Y.

	X and Y are dicts mapping a label to a set of items; cell (x, y) of
	the table is |X[x] & Y[y]|.  Returns (chi2, p), or (None, None) when
	either dict is empty or any cell of the table would be zero.
	"""
	xs=list(X.keys())
	ys=list(Y.keys())

	if len(xs)==0 or len(ys)==0:
		print('len(xs)==0 or len(ys)==0')
		return (None,None)

	rows=[]
	for x in X:
		row=[]
		for y in ys:
			cell=len(Y[y].intersection(X[x]))
			if cell==0:
				# A zero cell invalidates the test; bail out early.
				print('intersect==0 ' + x + ' ' + y)
				return (None,None)
			row.append(cell)
		rows.append(row)

	table = np.array(rows)
	print('chisquare_contingency')
	print('xs=' + str(xs))
	print('ys=' + str(ys))
	print(table)

	from scipy.stats import chi2_contingency

	chi2, p, dof, expected = chi2_contingency(table)

	printdisplay(MESSAGE_INFO,'  '.join(["chi2 statistic: " + str(chi2) , "p-value: " + str(p),  "degrees of freedom: " + str(dof) ,  "expected frequencies: " +  str(expected)]))

	return (chi2,p)

#https://www.scribbr.com/statistics/chi-square-test-of-independence/#:~:text=When%20you%20want%20to%20perform,in%20each%20combination%20of%20groups.


'''
  # Identical partitions
X1 = [ [1,2,3,4,5], [6,7,8,9,10] ]
Y1 = [ [1,2,3,4,5], [6,7,8,9,10] ]
print(variation_of_information(X1, Y1))
# VI = 0

# Similar partitions
X2 = [ [1,2,3,4], [5,6,7,8,9,10] ]
Y2 = [ [1,2,3,4,5,6], [7,8,9,10] ]
print(variation_of_information(X2, Y2))
# VI = 1.102

# Dissimilar partitions
X3 = [ [1,2], [3,4,5], [6,7,8], [9,10] ]
Y3 = [ [10,2,3], [4,5,6,7], [8,9,1] ]
print(variation_of_information(X3, Y3))
# VI = 2.302

# Totally different partitions
X4 = [ [1,2,3,4,5,6,7,8,9,10] ]
Y4 = [ [1], [2], [3], [4], [5], [6], [7], [8], [9], [10] ]
print(variation_of_information(X4, Y4))
# VI = 3.322 (maximum VI is log(N) = log(10) = 3.322)
'''

#Journal of Machine Learning Research 11 (2010) 2837-2854
#Information Theoretic Measures for Clusterings Comparison:
#Variants, Properties, Normalization and Correction for Chance
#https://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf

#Adapting the Right Measures for K-means Clustering


def norm_variation_of_information(X, Y):
	"""Variation of information (in bits) between partitions X and Y.

	X and Y are lists of clusters covering the same n items; returns the
	absolute VI value (0.0 for identical partitions).
	"""
	total = float(sum(len(cluster) for cluster in X))
	vi = 0.0
	for cluster_x in X:
		items_x = set(cluster_x)
		px = len(cluster_x) / total
		for cluster_y in Y:
			qy = len(cluster_y) / total
			overlap = len(items_x & set(cluster_y)) / total
			if overlap > 0.0:
				vi += overlap * (log(overlap / px, 2) + log(overlap / qy, 2))
	return abs(vi)


def HI(X,Y):
	"""Joint entropy H and mutual information I of two partitions.

	X and Y are lists of clusters covering the same n items.  Returns
	(H, I) in bits, where H = -sum r*log2(r) over the joint distribution
	r = |x & y| / n, and I is the mutual information of the partitions.
	The variation of information is then H - I.
	"""
	n = float(sum(len(x) for x in X))
	# Contingency counts nij[(i, j)] = |X[i] & Y[j]|, computed once and
	# reused below (the original recomputed every intersection twice).
	nij=dict()
	for i, x in enumerate(X):
		sx = set(x)
		for j, y in enumerate(Y):
			nij[(i,j)] = len(sx & set(y))

	# Marginals: a[i] = row sums, b[j] = column sums.
	a = [sum(nij[(i,j)] for j in range(len(Y))) for i in range(len(X))]
	b = [sum(nij[(i,j)] for i in range(len(X))) for j in range(len(Y))]

	H = 0.0
	I = 0.0
	for i in range(len(X)):
		for j in range(len(Y)):
			r = nij[(i,j)] / n
			if r > 0.0:
				H += r * log(r, 2)
				I += r * log(r / (a[i]*b[j]/(n*n)), 2)
	return (-H,I)

