Skip to content
This repository was archived by the owner on Mar 8, 2020. It is now read-only.
This repository was archived by the owner on Mar 8, 2020. It is now read-only.

[feature request] add position information #108

Open
@EgorBu

Description

@EgorBu

Hi,
I found that python-driver lacks position information for several types of tokens.

import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")
file_loc = "location/of/file.py"

# Load the raw text so the reported positions can be checked against it.
with open(file_loc, "r") as f:
    content = f.read()

# Ask the server for the UAST of the same file.
uast = client.parse(file_loc).uast

# Keep only the nodes that carry a token, ordered by reported start offset
# (list.sort is stable, so ties keep their pre-order traversal order).
nodes = [
    node
    for node in bblfsh.iterator(uast, bblfsh.TreeOrder.PRE_ORDER)
    if node.token
]
nodes.sort(key=lambda n: n.start_position.offset)

# Columns: start offset | token | source slice of the token's length taken at
# the start offset | source slice from the start offset to the end offset.
for n in nodes:
    start = n.start_position.offset
    end = n.end_position.offset
    print(
        start,
        n.token,
        content[start:start + len(n.token)],
        content[start:end + 1],
        sep="|",
    )

The source code I used as input is given in full below:

import argparse
import os
import tempfile
import unittest

import sourced.ml.tests.models as paths
from sourced.ml.models import Topics
from sourced.ml.cmd import bigartm2asdf


class TopicsTests(unittest.TestCase):
    def setUp(self):
        self.model = Topics().load(source=paths.TOPICS)

    def test_dump(self):
        res = self.model.dump()
        self.assertEqual(res, """320 topics, 1000 tokens
First 10 tokens: ['ulcancel', 'domainlin', 'trudi', 'fncreateinstancedbaselin', 'wbnz', 'lmultiplicand', 'otronumero', 'qxln', 'gvgq', 'polaroidish']
Topics: unlabeled
non-zero elements: 6211  (0.019409)""")  # noqa

    def test_props(self):
        self.assertEqual(len(self.model), 320)
        self.assertEqual(len(self.model.tokens), 1000)
        self.assertIsNone(self.model.topics)
        zt = self.model[0]
        self.assertEqual(len(zt), 8)
        self.assertEqual(zt[0][0], "olcustom")
        self.assertAlmostEqual(zt[0][1], 1.23752e-06, 6)

    def test_label(self):
        with self.assertRaises(ValueError):
            self.model.label_topics([1, 2, 3])
        with self.assertRaises(TypeError):
            self.model.label_topics(list(range(320)))
        self.model.label_topics([str(i) for i in range(320)])
        self.assertEqual(self.model.topics[0], "0")

    def test_save(self):
        with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as f:
            self.model.save(f.name)
            new = Topics().load(f.name)
            self.assertEqual(self.model.tokens, new.tokens)
            self.assertEqual((self.model.matrix != new.matrix).getnnz(), 0)

    def test_bigartm2asdf(self):
        with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as f:
            args = argparse.Namespace(
                input=os.path.join(os.path.dirname(__file__), paths.TOPICS_SRC),
                output=f.name)
            bigartm2asdf(args)
            model = Topics().load(f.name)
            self.assertEqual(len(model), 320)
            self.assertEqual(len(model.tokens), 1000)


if __name__ == "__main__":
    unittest.main()

As a result, we can see several tokens without position information (their reported offset is 0):

0|argparse|import a|i
0|os|im|i
0|tempfile|import a|i
0|unittest|import a|i
0|sourced.ml.tests.models|import argparse
import |i
0|paths|impor|i
0|sourced.ml.models|import argparse
i|i
0|Topics|import|i
0|sourced.ml.cmd|import argpars|i
0|bigartm2asdf|import argpa|i
0|source|import|i
0|!=|im|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i
0|==|im|i
184|TopicsTests|TopicsTests|TopicsTests

Some of them are imports, like:

0|argparse|import a|i
0|os|im|i

Some are operators:

0|==|im|i
0|!=|im|i

Some are keyword argument names:

0|source|import|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions