This repository was archived by the owner on Mar 8, 2020. It is now read-only.
This repository was archived by the owner on Mar 8, 2020. It is now read-only.
[feature request] add position information #108
Open
Description
Hi,
I found that python-driver lacks position information for several types of tokens.
import bblfsh

# Reproduction script: parse a Python file with Babelfish and compare each
# token against the source text found at the position the driver reports.
# Requires a bblfsh server reachable at the address below.
client = bblfsh.BblfshClient("0.0.0.0:9432")
file_loc = "location/of/file.py"

# Read the file content so reported positions can be checked against it.
with open(file_loc, "r") as f:
    content = f.read()

# Extract the UAST.
uast = client.parse(file_loc).uast

# Select nodes that carry a token and sort them by start offset.
# sorted() already returns a list and is stable, so nodes sharing an offset
# keep their pre-order traversal order.
nodes = sorted(
    (node for node in bblfsh.iterator(uast, bblfsh.TreeOrder.PRE_ORDER)
     if node.token),
    key=lambda n: n.start_position.offset,
)

# For every token print, separated by "|":
#   start offset, the token itself,
#   the source slice of len(token) characters starting at that offset,
#   the source slice from the start offset through the end offset.
# When positions are correct, the token and the first slice should match.
for n in nodes:
    start = n.start_position.offset
    print(start, n.token,
          content[start:start + len(n.token)],
          content[start:n.end_position.offset + 1],
          sep="|")
The source file I used as input for the script is shown below:
import argparse
import os
import tempfile
import unittest
import sourced.ml.tests.models as paths
from sourced.ml.models import Topics
from sourced.ml.cmd import bigartm2asdf
class TopicsTests(unittest.TestCase):
def setUp(self):
self.model = Topics().load(source=paths.TOPICS)
def test_dump(self):
res = self.model.dump()
self.assertEqual(res, """320 topics, 1000 tokens
First 10 tokens: ['ulcancel', 'domainlin', 'trudi', 'fncreateinstancedbaselin', 'wbnz', 'lmultiplicand', 'otronumero', 'qxln', 'gvgq', 'polaroidish']
Topics: unlabeled
non-zero elements: 6211 (0.019409)""") # noqa
def test_props(self):
self.assertEqual(len(self.model), 320)
self.assertEqual(len(self.model.tokens), 1000)
self.assertIsNone(self.model.topics)
zt = self.model[0]
self.assertEqual(len(zt), 8)
self.assertEqual(zt[0][0], "olcustom")
self.assertAlmostEqual(zt[0][1], 1.23752e-06, 6)
def test_label(self):
with self.assertRaises(ValueError):
self.model.label_topics([1, 2, 3])
with self.assertRaises(TypeError):
self.model.label_topics(list(range(320)))
self.model.label_topics([str(i) for i in range(320)])
self.assertEqual(self.model.topics[0], "0")
def test_save(self):
with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as f:
self.model.save(f.name)
new = Topics().load(f.name)
self.assertEqual(self.model.tokens, new.tokens)
self.assertEqual((self.model.matrix != new.matrix).getnnz(), 0)
def test_bigartm2asdf(self):
with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as f:
args = argparse.Namespace(
input=os.path.join(os.path.dirname(__file__), paths.TOPICS_SRC),
output=f.name)
bigartm2asdf(args)
model = Topics().load(f.name)
self.assertEqual(len(model), 320)
self.assertEqual(len(model.tokens), 1000)
if __name__ == "__main__":
unittest.main()
As a result, we can see several tokens without position information (their offset is reported as 0):
0|argparse|import a|i
0|os|im|i
0|tempfile|import a|i
0|unittest|import a|i
0|sourced.ml.tests.models|import argparse
import |i
0|paths|impor|i
0|sourced.ml.models|import argparse
i|i
0|Topics|import|i
0|sourced.ml.cmd|import argpars|i
0|bigartm2asdf|import argpa|i
0|source|import|i
0|!=|im|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i
0|==|im|i
184|TopicsTests|TopicsTests|TopicsTests
Some of them are imports, for example:
0|argparse|import a|i
0|os|im|i
Some are operators:
0|==|im|i
0|!=|im|i
Some are keyword argument names:
0|source|import|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i