Skip to content

Booster.predict returns same score for any distinct instance when executed via JEP #2500

Closed
@ManuelMourato25

Description

I am trying to call LightGBM via JEP (Java Embedded Python), in order to predict the scores of a couple of records.
However, when I execute Booster.predict via JEP, it returns the same constant score of 0.04742587 for every distinct record passed.
This does not happen when I invoke LightGBM directly from a Python script.
Any ideas on what the issue might be?
Note: when using JEP to invoke different models, like xgboost, this issue does not happen.

Environment info

Operating System: Ubuntu 18.04

CPU/GPU model: Running on local machine with 8 cores / 32 GB RAM

Python version: 3.6.9
GCC version: 7.4.0
Java version: 1.8
Jep version: 3.7.1

LightGBM version or commit hash: 2.3.1

Steps to reproduce

COMMON STEPS:

  1. Extract the model contained in the following zip into a file named m0_test.model:

m0_test.zip

  1. Save the following script as classifier.py, in any folder you wish (replace <PATH_TO_MODEL> accordingly)
import lightgbm as lgb
import numpy as np
import pandas as pd
import random as rd


class Classifier:
    """Thin wrapper around a LightGBM ``Booster`` loaded from a model file."""

    def __init__(self):
        """Load the booster from the hard-coded model path."""
        # Placeholder path: replace <PATH_TO_MODEL> before running.
        model_file_name = '<PATH_TO_MODEL>/m0_test.model'

        self.model = Classifier.get_model(model_file_name)

    def getClassDistribution(self, instances):
        """Score ``instances`` and return ``[array([score, 1 - score])]``.

        The flattened score is also printed to stdout, prefixed with
        ``Score:``.
        """
        score = self.get_transaction_score(instances)
        message = 'Score:'+str(score)
        print(message)

        complement = 1 - score
        distribution = np.array([score, complement])
        return [distribution]

    def get_transaction_score(self, instances):
        """Return the booster's prediction for ``instances`` as a flat array.

        Prediction is limited to the booster's ``best_iteration``.
        """
        booster = self.model
        raw_prediction = booster.predict(
            instances,
            num_iteration=booster.best_iteration,
        )
        return raw_prediction.flatten()

    @staticmethod
    def get_model(model_name):
        """Read the model text from ``model_name`` and build a Booster from it."""
        with open(model_name, 'r') as model_file:
            model_text = model_file.read()
        return lgb.Booster(model_str=model_text)

JEP EXECUTION:

  1. Install JEP
# Fetch the JEP sources and enter the checkout.
git clone https://github.com/ninia/jep.git 
cd jep 
echo "Installing JEP..." 
# Pin the v3.7.1 tag before building.
git checkout v3.7.1 
# Build and install JEP into the active Python environment.
python setup.py build install 
  1. Set the following environment variables (replace variables inside <> accordingly):
LD_LIBRARY_PATH=<PATH_TO_JEP>/lib/python3.6/site-packages/jep
LD_PRELOAD=<PATH_TO_PYTHON_SO>/lib/libpython3.6m.so
JEP_LOCATION=<PATH_TO_JEP>/lib/python3.6/site-packages/jep
JEP_JAR=${JEP_LOCATION}/jep-3.7.0.jar
TO_PRELOAD=${LD_PRELOAD}
  1. Compile and run the following Java Class (replace <PATH_TO_FOLDER_CONTAINING_CLASSIFIER.PY> accordingly):
package com.feedzai.tests;
import jep.Jep;

/**
 * Minimal reproduction: scores five hand-written records with the Python
 * {@code Classifier} class through an embedded Jep interpreter.
 */
public class TestLightGBM {

    /**
     * Starts a Jep interpreter, imports the Python classifier, defines five
     * single-row NumPy records and evaluates the classifier on each of them.
     * Any exception is reported on stdout as "Error" followed by the exception.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // try-with-resources guarantees the interpreter is closed on every
        // path; the original never called close(), leaking it.
        try (Jep jep = new Jep()) {

            jep.eval("import sys");
            // Imported on the Python side but not used by the statements below.
            jep.eval("from java.lang import System");
            // Make classifier.py importable.
            jep.eval("sys.path.append(\"<PATH_TO_FOLDER_CONTAINING_CLASSIFIER.PY>\")");
            jep.eval("from classifier import Classifier");

            jep.eval("import numpy as np");

            // Five records, each a 1 x 10 array.
            jep.eval("record1 = np.array([[ 4.61935575 ,-5.18927169,  2.74834851,  1.0087401 ,  1.95090556, -3.33563201,"
                   + "1.  ,        2.     ,     2.     ,     2.        ]]) ");
            jep.eval("record2 = np.array([[2.30000000e+01, 2.60000000e+02, 1.00000000e+06, 2.29400000e+02,"
                    +"2.30209137e+07 ,1.09000000e+04 ,2.00000000e+00 ,2.00000000e+00,"
                    +"1.00000000e+00, 2.00000000e+00]])");
            jep.eval( "record3 = np.array([[2.20000000e+01, 2.37000000e+02 ,1.00000100e+06 ,3.50400000e+02,"
                   + "1.90109777e+07, 1.00320000e+04, 1.00000000e+00, 3.00000000e+00,"
                 +   "2.00000000e+00, 5.00000000e+00]])");

            jep.eval( "record4 = np.array([[9.80000000e+01, 2.57000000e+02, 1.00000200e+06, 3.33400000e+02,"
                    +"1.41323727e+07, 1.19990000e+04, 0.00000000e+00, 0.00000000e+00,"
                   +" 2.00000000e+00, 3.00000000e+00]])");
            jep.eval( "record5 = np.array([[1.30000000e+01, 3.17000000e+02, 1.00000300e+06, 6.99400000e+02,"
                   + "3.30892917e+07, 1.03560000e+04, 2.00000000e+00, 2.00000000e+00,"
                   +" 1.00000000e+00, 1.00000000e+00]])");


            // Score each record; the first element of the returned
            // distribution is the model score.
            // NOTE(review): the bare expression values appear in the reported
            // output, presumably echoed by Jep's eval — confirm.
            jep.eval("test = Classifier()");
            jep.eval("test.getClassDistribution(record1)[0].item(0)");
            jep.eval("test.getClassDistribution(record2)[0].item(0)");
            jep.eval("test.getClassDistribution(record3)[0].item(0)");
            jep.eval("test.getClassDistribution(record4)[0].item(0)");
            jep.eval("test.getClassDistribution(record5)[0].item(0)");

        }
        catch(Exception e){
            // Covers failures from interpreter start-up, eval and close.
            System.out.println("Error");
            System.out.println(e);

        }
    }

}

  1. Observe that the returned score is always the same, regardless of which record is classified:
Finished loading model, total used 200 iterations
Score:[0.04742587]
0.04742587317756678
Score:[0.04742587]
0.04742587317756678
Score:[0.04742587]
0.04742587317756678
Score:[0.04742587]
0.04742587317756678
Score:[0.04742587]
0.04742587317756678

PYTHON EXECUTION:

  1. Open the Python console
  2. Type the following code, equivalent to the example above (replace <PATH_TO_FOLDER_CONTAINING_CLASSIFIER.PY> accordingly):
import sys
sys.path.append("<PATH_TO_FOLDER_CONTAINING_CLASSIFIER.PY>") # ex: /opt/folder/
from classifier import Classifier
import numpy as np
# Five single-row feature matrices (1 x 10 each), same values as the Java repro.
record1 = np.array([[
    4.61935575, -5.18927169, 2.74834851, 1.0087401, 1.95090556,
    -3.33563201, 1., 2., 2., 2.,
]])
record2 = np.array([[
    2.30000000e+01, 2.60000000e+02, 1.00000000e+06, 2.29400000e+02,
    2.30209137e+07, 1.09000000e+04, 2.00000000e+00, 2.00000000e+00,
    1.00000000e+00, 2.00000000e+00,
]])
record3 = np.array([[
    2.20000000e+01, 2.37000000e+02, 1.00000100e+06, 3.50400000e+02,
    1.90109777e+07, 1.00320000e+04, 1.00000000e+00, 3.00000000e+00,
    2.00000000e+00, 5.00000000e+00,
]])
record4 = np.array([[
    9.80000000e+01, 2.57000000e+02, 1.00000200e+06, 3.33400000e+02,
    1.41323727e+07, 1.19990000e+04, 0.00000000e+00, 0.00000000e+00,
    2.00000000e+00, 3.00000000e+00,
]])
record5 = np.array([[
    1.30000000e+01, 3.17000000e+02, 1.00000300e+06, 6.99400000e+02,
    3.30892917e+07, 1.03560000e+04, 2.00000000e+00, 2.00000000e+00,
    1.00000000e+00, 1.00000000e+00,
]])

# Load the model once, then print the first element of each record's
# class distribution (the model score).
test = Classifier()
print(test.getClassDistribution(record1)[0].item(0))
print(test.getClassDistribution(record2)[0].item(0))
print(test.getClassDistribution(record3)[0].item(0))
print(test.getClassDistribution(record4)[0].item(0))
print(test.getClassDistribution(record5)[0].item(0))

  1. The instances are now correctly scored:
>>> print(test.getClassDistribution(record1)[0].item(0))
Score:[1.07931287e-09]
1.0793128698291579e-09
>>> print(test.getClassDistribution(record2)[0].item(0))
Score:[1.25450336e-09]
1.2545033620809255e-09
>>> print(test.getClassDistribution(record3)[0].item(0))
Score:[1.1813487e-09]
1.1813487042059903e-09
>>> print(test.getClassDistribution(record4)[0].item(0))
Score:[1.1813487e-09]
1.1813487042058477e-09
>>> print(test.getClassDistribution(record5)[0].item(0))
Score:[9.23272512e-10]
9.232725116775672e-10

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions