diff --git a/README.md b/README.md deleted file mode 100644 index fa10ed4..0000000 --- a/README.md +++ /dev/null @@ -1,25 +0,0 @@ -pyml is an interactive machine learning program in python 3. It is an experimental tool for automating processes of performing -logistic and linear regressions of datasets in csv files. It is capable of performing linear regression with arbitrary order -polynomial features (the program itself learns exponents for the features). Pyml is also capable of exporting the results -into json files and it keeps always the best result (accuracy), in case of logistic regressions. Apart from computing regressions, -it also allows the user to visualize the data before computations and also shows regression curves after computations are done. -All of this is done interactively in a extremely intuitive manner. - - - -It works on linux systems with python 3.7 installed and uses only a few libraries, all of which are listed in the requirements.txt -file. All one needs to do is clone into its repository - -$git clone https://www.github.com/physicassio/pyml - -cd into the cloned directory, install dependencies - -$pip3 install -r requirements.txt - -and run it - -$python3 pyml.py - -All the fun thereafter is pure intuition. - -Feel free to report issues/bugs, recommend improvements and contribute. diff --git a/analysis.py b/analysis.py deleted file mode 100644 index 0d7a365..0000000 --- a/analysis.py +++ /dev/null @@ -1,293 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt -import numpy as np -import sys -import math -import aux -from termcolor import colored -import random -import visualize as vi -import time - -__author__ = 'Cássio Alves' - -#Disable pandas' alert when inserting arrays in columns -pd.options.mode.chained_assignment = None - -#Function to get columns in selected file -def get_columns(in_file):#,reg_type): - - try: - data = pd.read_csv(in_file,encoding='latin1') - - except: - sys.exit(colored('File '+str(in_file)+' does not seem to be a valid CSV file','red',attrs=['bold'])) - #selecting only numeric columns in selected file - dataint = data[data.columns[data.dtypes=='int64']] - datafloat = data[data.columns[data.dtypes=='float64']] - - #setting a dummy name for row index so the data can be merged based on this index - data.index.name = 'dummy_index' - data = pd.merge(dataint,datafloat,on='dummy_index') - columns = data.columns - - - - print(colored('Numeric columns in file %s'%(in_file),'blue',attrs=['bold'])) - for i in range(len(columns)): - - #+1 so user does not need to type in 0 - print(colored(str(i+1)+'-'+columns[i]+' '+str(data[columns[i]].dtypes),'green',attrs=['bold'])) - - inp = '' - chosen_columns = [] - while (inp != 'q'): - inp = input(colored('Choose one column and press Enter or type in \'0\' to finish selecting. (Type in \'a\' to choose all columns at once) ','white',attrs=['bold'])) - - - if (inp == 'a'): - - chosen_columns = list(columns) - break - - - elif (int(inp) == 0): - break - - elif (int(inp) in range(1,len(columns)+1)): - - data[columns[int(inp)-1]] - chosen_columns.append(columns[int(inp)-1]) - - else: - - print(colored('Invalid column. Please, select a valid column name ','red',attrs=['bold'])) - - - selected_data = data[chosen_columns] - - #Prompting user for what kind of regression to perform - mod = '' - print(colored("1 - Logistic\n2 - Linear",'green',attrs=['bold'])) - mod = int(input(colored("Which type of regression do you want to perform? ",'white',attrs=['bold']))) - while (1>0): - if (mod == 1): - reg_type = 'logistic' - break - elif (mod == 2): - reg_type = 'linear' - break - else: - print(colored("Invalid option. Please, select [1] for logistic regression or [2] for linear regression ",'red',attrs=['underline','bold'])) - mod = int(input(colored("Which type of regression do you want to perform? ",'cyan',attrs=['bold']))) - check_data(selected_data,reg_type) - - print(colored("Chosen columns:",'blue',attrs=['bold'])) - - for i in range(len(chosen_columns)): - print(colored(str(i+1)+'-'+chosen_columns[i],'green',attrs=['bold'])) - - #asks the user for the dependent variable column - target = int(input(colored('Choose the target column ','white',attrs=['bold']))) - tar = selected_data[chosen_columns[target-1]] - target_name = chosen_columns[target-1] - - print(colored("You chose '"+str(chosen_columns[target-1])+"' as your target column ",'yellow',attrs=['bold'])) - - #removing target column from the chosen columns list, so the function can return x and y arrays separately - chosen_columns.pop(target-1) - final_data = pd.DataFrame(selected_data[chosen_columns]) - - check_data(selected_data,reg_type) - #plotting features against target column - plo = '' - plo = str(input(colored("Do you want to visualize your data before analyzing it? [y/n]",'white',attrs=['bold']))) - #mod = int(input(colored("Which type of regression do you want to perform? ",'white',attrs=['bold']))) - while (1>0): - if (plo == 'y'): - vi.main(final_data,tar) - break - elif (plo == 'n'): - #reg_type = 'linear' - break - else: - print(colored("Invalid option. Please, select y or n ",'red',attrs=['underline','bold'])) - plo = str(input(colored("Do you want to visualize your data before analyzing it? [y/n] ",'cyan',attrs=['bold']))) - - - #adding bias column to x array - add_bias(final_data) - chosen_columns.insert(0,'bias') - - return np.array(final_data),np.array(tar),reg_type,chosen_columns,target_name - - -#Function to add bias column to dataframe -def add_bias(data): - size = data.shape[0] - data.insert(0,'bias',np.ones(size)) - -#Function to check if x-columns have NaN values and asks if user wants to replace with column Gaussian distributed values(based in the column properties themselves) -#Might be a good idea to implement replacing with values other the mentioned above -def check_data(data,reg_type): - - for column in data: - col = data[column] - - #variable to calculate how much data is missing - missing = len(col[pd.isna(col)])/len(col) - - if (col.dtypes == np.float64) and (missing != 0.0): - - warning = str(col) + " is missing %.2f%s of its total length (NaN values)"%(missing*100,'%') - new_column = np.array(col) - feed = np.random.normal(col.mean(),col.std(),len(col[pd.isna(col)])) - new_column[(pd.isna(new_column))] = feed - print(colored(warning,'red')) - decision = input(colored("Do you want to fill NaN values with Gaussian distributed values based on the column's properties themselves?(y/n) ",'white',attrs=['bold'])) - while (1>0): - - if (decision == 'y'): - - new_column = np.array(col) - feed = np.random.normal(col.mean(),col.std(),len(col[pd.isna(col)])) - new_column[(pd.isna(new_column))] = feed - data[column] = new_column - break - elif (decision == 'n'): - break - else: - print(colored("Please type in 'y' or 'n'",'white',attrs=['bold'])) - decision = input(colored("Do you want to fill NaN values with Gaussian distributed values based on the column's properties themselves?(y/n) ",attrs=['bold'])) - - #plt.plot(col,np.exp(-((col-col.mean())**2)/(4*col.std()**2)),'ro') - #plt.show() - - - if (reg_type == 'logistic'): - - aux.regularize(data) - -#This is where the fun really begins :) -def logistic(x,y): - - train_x = x[:int(0.7*len(x))] - train_y = y[:int(0.7*len(y))] - - test_x = x[int(0.7*len(x)):] - test_y = y[int(0.7*len(y)):] - - #Reading alpha, number of iterations and lambda - alpha = ' ' - while (type(alpha) is not int) or (type(alpha) is not float): - try: - alpha = float(input(colored("What learning rate do you want to use?(It's recommended values in range [0.0001-1.0]) ",'white',attrs=['bold']))) - break - except KeyboardInterrupt: - sys.exit('Exitting...') - except: - print(colored("The learning rate must be a number",'red',attrs=['bold'])) - n = '' - while (type(n) is not int): - try: - n = int(input(colored("How many iterations do you want to run? ",'white',attrs=['bold']))) - break - except KeyboardInterrupt: - sys.exit('Exitting...') - except: - print(colored("Number of iterations must be a whole number",'red',attrs=['bold'])) - lamb = '' - while (type(lamb) is not float) or (type(lamb) is not int): - try: - lamb = float(input(colored("What regularization parameter(lambda) do you want to use? ",'white',attrs=['bold']))) - break - except KeyboardInterrupt: - sys.exit('Exitting...') - except: - print(colored("lambda must be a number",'red',attrs=['bold'])) - - theta = (np.ones(train_x.shape[1])).T - grad,theta = aux.grad_desc_log(train_x,train_y,theta,alpha,lamb,n) - - - #Checking whether there are any NaN values in weights' array - for value in theta: - if (np.isnan(value)): - sys.exit(colored('Nan values found in weights\' array. Try changing your parameters (e.g. lambda and/or alpha) ','red',attrs=['bold'])) - - #theta = (np.ones(train_x.shape[1])).T - print(colored("Final values for grad="+str(grad)+"\ttheta="+str(theta)+" obtained by using Gradient Descent",'green',attrs=['bold'])) - - #array for plotting sigmoid function to with the learned weights - xp = np.linspace(min(test_x.dot(theta)),max(test_x.dot(theta)),len(test_y)) - result = aux.sig(test_x.dot(theta)) - - #setting threshold for success or failure - result[(result > 0.5 )] = 1 - result[(result <= 0.5)] = 0 - - accu = np.mean(result == test_y) - print(colored("The model predicted %d samples right (out of %d), resulting in an accuracy = %.3f"%(len(result[result == test_y]),len(test_y),accu),'white',attrs=['bold'])) - - #plots for comparison - #vi.visualize_result(x,y,cols) - plt.plot(xp,aux.sig(xp),label = 'sigmoid function') #sigmoid function - plt.plot(test_x.dot(theta),test_y,'ro',label = 'testing set') #original data - plt.plot(test_x.dot(theta),result,'b+',label = 'predictions')#data using learned weights - plt.legend() - plt.show() - return(list(theta),accu) - -def linear(x,y,cols): - - #Reading alpha and number of iterations - alpha = ' ' - while (type(alpha) is not int) or (type(alpha) is not float): - try: - alpha = float(input(colored("What learning rate do you want to use?(It's recommended values in range [0.0001-1.0]) ",'white',attrs=['bold']))) - break - except KeyboardInterrupt: - sys.exit('Exitting...') - except: - print(colored("The learning rate must be a number",'red',attrs=['bold'])) - n = '' - while (type(n) is not int): - try: - n = int(input(colored("How many iterations do you want to run? ",'white',attrs=['bold']))) - break - except KeyboardInterrupt: - sys.exit('Exitting...') - except: - print(colored("Number of iterations must be a whole number",'red',attrs=['bold'])) - - #getting exponents for linear regression, in case there is any non-linear polinomial feature - exp,coe = aux.get_expo(pd.DataFrame(x).drop(0,1),y) - - exp.insert(0,1) - - for i in range(len(exp)): - - if exp[i] > 1.0: - x[:,i] = x[:,i]**exp[i] - - theta = 0*(np.ones(x.shape[1])).T - - grad,theta = aux.grad_desc_linear(x,y,theta,alpha,n,exp) - - #Checking whether there are any NaN values in weights' array - for value in theta: - if (np.isnan(value)): - sys.exit(colored('Nan values found in weights\' array. Try changing your parameters (e.g. lambda and/or alpha) ','red',attrs=['bold'])) - - print(colored("Final values for grad="+str(grad)+"\ttheta="+str([theta[i]**(1./exp[i]) for i in range(len(theta))])+" obtained by using Gradient Descent",'green',attrs=['bold'])) - - #Computing theta using the normal equation - xtx = np.linalg.inv((x.T).dot(x)) - normal_theta = xtx.dot((x.T).dot(y)) - print(colored("Final value for theta using the Normal Equation\t"+str([normal_theta[i]**(1./exp[i]) for i in range(len(normal_theta))]),'green',attrs=['bold'])) - - vi.visualize_result(theta,x,y,exp,cols) - return(list(theta)) - -if __name__ == '__main__': - main() diff --git a/aux.py b/aux.py deleted file mode 100644 index a6992a9..0000000 --- a/aux.py +++ /dev/null @@ -1,140 +0,0 @@ -__author__ = 'Cássio Alves' -import sys -import math -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from termcolor import colored -import json -#import time - -#function to regularize data when logistic regression is selected. -#Might be a good idea to use for linear regression as well. -def regularize(matrix): - for column in matrix.columns: - #regularizes all columns with standard deviation greater than 1(not really sure if 1 is a good value) - try: - - st_dev = matrix[column].describe()['std'] - if (st_dev > 1.0): - matrix[column] = (1./st_dev)*(matrix[column] - matrix[column].mean()) - - - except KeyError: - continue -#Sigmoid function -def sig(x): - return np.array(1./(1+np.exp(-x))) - -#Function for calculation cross-entropy for logistic regression -def cost_log(x,y,theta,lamb): - - m = len(y) - g = sig(np.dot(x,theta)) - tt = theta #array for taking into account regularization - tt[0] = 0 #(recall that the second sum in cross entropy starts at j =1) - cost = 0.0 - cost = (2./m)*sum(-y*np.log(g)-(1-y)*np.log(1-g))+(lamb/m)*(tt*tt)#+(0.5*lamb/m)*(tt/np.abs(tt)) - #r = (lamb/m)*sum(tt*tt) - #print(r) - #for i in range(m): - # if (y[i] == 0): - # cost+=(1./m)*np.log(1-g[i])+r#+(0.5*lamb/m)*np.dot(tt,tt)#[i]**2#(np.dot(tt,tt))#+sum(abs(tt))) - # else: - # cost+=(1./m)*np.log(g[i])+r#+(0.5*lamb/m)*np.dot(tt,tt)#[i]**2#(np.dot(tt,tt))#+sum(abs(tt))) - #print("cost=",cost) - return cost - -#Function for calculating cost function for linear regression -def cost_linear(x,y,theta): - - m = len(y) - h = np.dot(x,theta) - cost = (1./m)*(sum((h-y).dot(x)))**2 - return cost - -#Gradient descent for linear regression -def grad_desc_linear(x,y,theta,alpha,n_iter,exp): - - m = len(y) - c = [] - for i in range(n_iter+1): - - h = x.dot(theta) - grad = (2./m)*np.dot((h-y).T,x) - theta = theta - alpha*grad - c.append(cost_linear(x,y,theta)) - if i%100 ==0: - - print(colored("grad="+str(grad)+"\ttheta="+str([theta[i]**(1./exp[i]) for i in range(len(theta))])+" in the "+str(i)+"th iteration",'green')) - - plt.plot(range(len(c)),np.array(c),'--') - plt.ylabel('Cost Function') - plt.xlabel('Iterations') - plt.show() - return(grad,theta) - -#Gradient descent for logistic regression -def grad_desc_log(x,y,theta,alpha,lamb,n_iter): - - m = len(y) - #print(x) - c = [] - cost = [] - for i in range(int(n_iter+1)): - g = sig(x.dot(theta)) - - tt = np.array(list(theta)) - #print(theta) - tt[0] = 0 - - - grad = (1./m)*(np.dot((g-y).T,x))+(lamb/m)*(tt.T)#+(0.5*lamb/m)*(tt/np.sqrt(tt)) - theta = theta - alpha*grad - #print(theta) - #sys.exit() - #c.append(np.dot(grad,grad)) #Useful for checking gradient convergence - c.append(cost_log(x,y,theta,lamb)) - if i%100 ==0: - print(colored("grad="+str(grad)+"\ttheta="+str(theta)+" in the "+str(i)+"th iteration",'green')) - plt.plot(range(len(c)),np.array(c),'--') - plt.ylabel('Cost Function') - plt.xlabel('Iterations') - plt.show() - return (grad,theta) - -#function to get features' exponents (generalize to arbitrary order polinomials) -def get_expo(x,y): - n = len(y) - expos = [] - coeffs = [] - Y = np.log10(y) - for column in x.columns: - column = np.log10(x[column]) - den = n*np.dot(column,column)-sum(column)**2 - expo = (n*np.dot(column,Y)-sum(column)*sum(Y))/den - coef = (sum(y)*np.dot(column,column)-sum(column)*np.dot(y,column))/den - #down = n*np.dot(column,column)-sum(column)**2#sum(column) - expos.append(np.round(expo,2)) - coeffs.append(np.round(10**coef,2)) - return expos,coeffs - -#function to export results to a json file -def export_json(fil,dic): - open(fil,'w').write(json.dumps(dic)) - -#function to look for past results for logistic regression performed on the same data file -#this function is called in pyml.py and if the past accuracy is greater than the current one, the current results are dismissed. -def check_past_result(fil): - - try: - past_results = json.load(open(fil,'r')) - return(past_results['accuracy']) - - - except FileNotFoundError: - return(-1) - - - - diff --git a/clean.py b/clean.py deleted file mode 100644 index f984674..0000000 --- a/clean.py +++ /dev/null @@ -1,84 +0,0 @@ -import numpy as np -import sys -import os - -def clean_file(lines,in_file): - new_file_name = 'clean_'+in_file - new_file = open(new_file_name,'wb') - for line in lines: - try: - line.decode('utf-8') - new_file.write(line) - except UnicodeDecodeError: - i = 0 - l = [] - while (i0): - - if (decision == 'y'): - #print(a.readli) - clean_file(open(in_file,'rb').readlines(),in_file) - break - elif (decision == 'n'): - break - else: - decision = input("Invalid option. Please type in 'y' or 'n' ") - break - #break - - -def write_bin_file(fil,content): - - #try: - #text_content = ''.join([chr(x) for x in content]) - #ontent = np.array(content) - file_size = os.path.getsize(fil) - buff = open(fil,'r') - - - - #for char in content: - # if chr(char).decode('utf-8'): - # continue - # else: - # print(char) - new_bin_file = open('clean_'+str(fil),'wb').write(bytearray(content)) - #except: - # sys.exit('Crap') - -def main(in_file): - check_file(in_file) - -if __name__ == '__main__': - main(input('Type in the file name to check ')) -#in_file(input('Type in the file name to check ')) -#print(content[content > 130]) -#print(chr(0xa0))#.encode('utf-8')) - diff --git a/images/pymlfin.png b/images/pymlfin.png new file mode 100644 index 0000000..3eb05a9 Binary files /dev/null and b/images/pymlfin.png differ diff --git a/pyml.py b/pyml.py deleted file mode 100644 index db153bf..0000000 --- a/pyml.py +++ /dev/null @@ -1,143 +0,0 @@ -import sys -import math -import pandas as pd -import numpy as np -import os -import subprocess as sp -import glob -from termcolor import colored -from colorama import Fore, Back, Style -import analysis as an -import aux -import matplotlib.pyplot as plt -import json - -__author__ = "Cássio Alves" -__version__ = "1.0[beta]" - -version_string = colored(str(__version__),'red') - - -print(colored("\t _",'yellow',attrs=['dark','bold'])) -print(colored("\t _ __ _ _ _ __ ___ | |",'yellow',attrs=['dark','bold'])) -print(colored("\t| '_ \| | | | '_ ` _ \| |",'yellow',attrs=['dark','bold'])) -print(colored("\t| |_) | |_| | | | | | | |",'yellow',attrs=['dark','bold'])) -print(colored("\t| .__/ \__, |_| |_| |_|_|",'yellow',attrs=['dark','bold'])) -print(colored("\t|_| |___/\t" +version_string,'yellow',attrs=['dark','bold'])) - -#Function to display files in a directory -def get_files(directory): - - while (1>0): - if (os.path.exists(directory)): - break - else : - directory = input(colored("Invalid path. Please, provide a valid path ",'red',attrs=['bold'])) - if (directory[-1] != '/'): - directory+='/' - file_list = glob.glob(directory+'*') - file_list.insert(0,'offset') - print(colored("Files in directory"+str(directory),'blue',attrs=['bold'])) - - #Printing files in the selected directory for user to choose - for i in range(1,len(file_list)): - fil = file_list[i] - if os.path.isdir(fil): - print(colored(str(i)+'-'+file_list[i]+' (dir)','blue',attrs=['bold'])) - else: - print(colored(str(i)+'-'+file_list[i],'green',attrs=['bold'])) - - while (1>0) or (os.path.isdir(chosen_file)): - - try: - chosen_file = file_list[int(input(colored('Which one do you want to open? ','white',attrs=['bold'])))] - if chosen_file == 'offset': - raise IndexError - elif (os.path.isdir(chosen_file)): - chosen_file = get_files(chosen_file) - break - else: - return chosen_file - break - except IndexError: - - print(colored("Please, type in a whole number in the range 1-"+str(len(file_list)-1),'red',attrs=['bold'])) - except KeyboardInterrupt: - sys.exit('Exitting...') - - return chosen_file - -#Function to open a check if the selected file exits and/or if it is a directory -def open_file(file_path): - - try: - - if os.path.isdir(file_path): - answer = input(colored('The selected option is a directory. Do you want to list its contents?(y/n) ','white',attrs=['bold'])) - if (answer == 'y'): - - return file_path - - else: - sys.exit('Exitting...') - - else: - return(file_path) - except FileNotFoundError: - sys.exit('File not found') - -def main(): - - #Prompting user for directory to list contents and file to open - chosen_file = get_files(input(colored('\nType in dir path for listing files in it ','white',attrs=['bold']))) - - #Prompting user for what kind of regression to perform - x,y,regression_type,cols,target = an.get_columns(chosen_file) - - result_files = chosen_file.rstrip('.csv') + '_results' - - if (regression_type == 'logistic'): - - theta,accuracy = an.logistic(x,y) - - #checking for past results and exporting the current ones if the accuracy is greater than the past one - #in case the past accuracy is better than current one, current results are dismissed - past_accu = aux.check_past_result(result_files+'.json') - if (accuracy > past_accu): - output_dict = {'theta':theta,'columns':cols,'accuracy':accuracy} - aux.export_json(result_files+'.json',output_dict) - - else: - print(colored('Your previous result had %.3f accuracy and the current %.3f. Current one will not be saved'%(past_accu,accuracy),'yellow',attrs=['bold'])) - - else: - theta = an.linear(x,y,cols) - output_dict = {'theta':theta,'columns':cols} - aux.export_json(result_files+'.json',output_dict) - - test_hip = input(colored('Do you want to apply your model to a different dataset? ','green',attrs=['bold'])) - - if (test_hip == 'y'): - - test_file = get_files(os.path.dirname(chosen_file)) - test_data = pd.read_csv(test_file)[cols[1:]] - an.add_bias(test_data) - an.check_data(test_data,regression_type) - - if (regression_type == 'logistic'): - res = aux.sig(np.dot(test_data,theta)) - res[res > 0.5 ] = 1 - res[res <= 0.5] = 0 - else: - res = np.dot(x,theta) - res_file = open(result_files+'_log_reg.csv','w') - res_file.write(target+'\n') - res_file.write('\n'.join([str(x) for x in res])) - res_file.close() - - - - -if __name__ == '__main__': - main() - diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 55fd601..0000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -pandas==0.25.1 -numpy==1.14.5 -matplotlib==2.2.2 -termcolor==1.1.0 -geojson==2.4.0 -jsonschema==2.6.0 diff --git a/visualize.py b/visualize.py deleted file mode 100644 index fa92d74..0000000 --- a/visualize.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from termcolor import colored -import pandas as pd -import sys -from mpl_toolkits.mplot3d import Axes3D - -def main(columns,target): - - print(target.name) - fig = plt.figure() - qtd = columns.shape[1] - n_rows = np.ceil(qtd/2) - print(n_rows) - - for column in columns: - - col_ind = columns.columns.get_loc(column) - - ax = fig.add_subplot(n_rows,2,col_ind+1) - ax.plot(columns[column],target,'ro') - ax.set_ylabel(target.name) - ax.set_xlabel(column) - - mng = plt.get_current_fig_manager() - mng.resize(*mng.window.maxsize()) - plt.show() - input(colored('Press Enter to continue...',attrs=['bold'])) - -def visualize_result(theta,x,target,exp,cols): - - -# for column in columns: -# plt.plot(columns[column]**(1./exp[column]),target,'o') -# plt.plot(columns[column]**(1./exp[column]),theta[0]+(theta[column]*columns[column]),label='%.3f x_%s + %.3f'%(theta[column]**(1./exp[column]),column,theta[0])) -# plt.xlabel(cols[column])""" - #print(columns.shape[1]) - for column in range(x.shape[1]): - - plt.plot(x[:,column]**(1./exp[column]),target,'o') - plt.plot(x[:,column]**(1./exp[column]),theta[0]+(theta[column]*x[:,column]),label='%.3f x_%s + %.3f'%(theta[column]**(1./exp[column]),column,theta[0])) - plt.xlabel(cols[column]) - plt.legend() - plt.show() - -if __name__ == '__main__': - main(columns,target)