database: Add support for Kconfig and Devicetree files

This introduces the idea of file families: each ident is identified by its family and can be referenced in compatible families. For example: - A Kconfig ident can be referenced in a C file but not in a Devicetree file. - A Devicetree ident is only referenced in Devicetree files. - A C ident is only referenced in C files. Kconfig idents are defined without the CONFIG_ prefix. We add it while indexing to make processing of other files easier. Signed-off-by: Maxime Chretien <maxime.chretien@bootlin.com>
bootlin · May 22, 2020 · 72571fb · 72571fb
1 parent bdcb386
commit 72571fb
Show file tree

Hide file tree

Showing 4 changed files with 122 additions and 31 deletions.
diff --git a/data.py b/data.py
@@ -29,6 +29,7 @@
 ##################################################################################
 
 defTypeR = {
+    'c': 'config',
     'd': 'define',
     'e': 'enum',
     'E': 'enumerator',
@@ -50,31 +51,43 @@
 
 class DefList:
     '''Stores associations between a blob ID, a type (e.g., "function"),
-        and a line number.'''
-    def __init__(self, data=b''):
-        self.data = data
+        a line number and a file family.
+        Also stores in which families the ident exists for faster tests.'''
+    def __init__(self, data=b'#'):
+        self.data, self.families = data.split(b'#')
 
     def iter(self, dummy=False):
         for p in self.data.split(b','):
-            p = re.search(b'(\d*)(\w)(\d*)', p)
-            id, type, line = p.groups()
+            p = re.search(b'(\d*)(\w)(\d*)(\w)', p)
+            id, type, line, family = p.groups()
             id = int(id)
             type = defTypeR [type.decode()]
             line = int(line)
-            yield(id, type, line)
+            family = family.decode()
+            yield(id, type, line, family)
         if dummy:
-            yield(maxId, None, None)
+            yield(maxId, None, None, None)
 
-    def append(self, id, type, line):
+    def append(self, id, type, line, family):
         if type not in defTypeD:
             return
-        p = str(id) + defTypeD[type] + str(line)
+        p = str(id) + defTypeD[type] + str(line) + family
         if self.data != b'':
             p = ',' + p
         self.data += p.encode()
 
     def pack(self):
-        return self.data
+        return self.data + b'#' + self.families
+
+    def add_family(self, family):
+        family = family.encode()
+        if not family in self.families.split(b','):
+            if self.families != b'':
+                family = b',' + family
+            self.families += family
+
+    def get_families(self):
+        return self.families.decode().split(',')
 
 class PathList:
     '''Stores associations between a blob ID and a file path.
@@ -100,7 +113,8 @@ def pack(self):
         return self.data
 
 class RefList:
-    '''Stores a mapping from blob ID to list of lines.'''
+    '''Stores a mapping from blob ID to list of lines 
+        and the corresponding family.'''
     def __init__(self, data=b''):
         self.data = data
 
@@ -110,16 +124,17 @@ def iter(self, dummy=False):
         while s.tell() < size:
             line = s.readline()
             line = line [:-1]
-            b,c = line.split(b':')
+            b,c,d = line.split(b':')
             b = int(b.decode())
             c = c.decode()
-            yield(b, c)
+            d = d.decode()
+            yield(b, c, d)
         s.close()
         if dummy:
-            yield(maxId, None)
+            yield(maxId, None, None)
 
-    def append(self, id, lines):
-        p = str(id) + ':' + lines + '\n'
+    def append(self, id, lines, family):
+        p = str(id) + ':' + lines + ':' + family + '\n'
         self.data += p.encode()
 
     def pack(self):

diff --git a/lib.py b/lib.py
@@ -183,6 +183,28 @@ def getDataDir():
 def currentProject():
     return os.path.basename(os.path.dirname(getDataDir()))
 
-def hasSupportedExt(filename):
-    ext = os.path.splitext(filename)[1]
-    return ext.lower() in ['.c', '.cc', '.cpp', '.c++', '.cxx', '.h', '.s']
+def getFileFamily(filename):
+    name, ext = os.path.splitext(filename)
+
+    if ext.lower() in ['.c', '.cc', '.cpp', '.c++', '.cxx', '.h', '.s'] :
+        return 'C' # C file family and ASM
+    elif ext.lower() in ['.dts', '.dtsi'] :
+        return 'D' # Devicetree files
+    elif name.lower()[:7] in ['kconfig'] and not ext.lower() in ['.rst']:
+        # Some files are named like Kconfig-nommu so we only check the first 7 letters
+        # We also exclude documentation files that can be named kconfig
+        return 'K' # Kconfig files
+    else :
+        return None
+
+compatibility_list = {
+    'C' : ['C', 'K'],
+    'K' : ['K'],
+    'D' : ['D']
+}
+
+# Check if families are compatible
+# First argument can be a list of different families
+# Second argument is the key for choosing the right array in the compatibility list
+def compatibleFamily(file_family, requested_family):
+    return any(item in file_family for item in compatibility_list[requested_family])
diff --git a/script.sh b/script.sh
@@ -101,9 +101,15 @@ tokenize_file()
         ref="$v:`denormalize $opt2`"
     fi
 
+    if [ $opt3 = "D" ]; then #Don't cut around '-' in devicetrees
+        regex='s%((/\*.*?\*/|//.*?\001|[^'"'"']"(\\.|.)*?"|# *include *<.*?>|[^\w-])+)([\w-]+)?%\1\n\4\n%g'
+    else
+        regex='s%((/\*.*?\*/|//.*?\001|[^'"'"']"(\\.|.)*?"|# *include *<.*?>|\W)+)(\w+)?%\1\n\4\n%g'
+    fi
+
     git cat-file blob $ref 2>/dev/null |
     tr '\n' '\1' |
-    perl -pe 's%((/\*.*?\*/|//.*?\001|[^'"'"']"(\\.|.)*?"|# *include *<.*?>|\W)+)(\w+)?%\1\n\4\n%g' |
+    perl -pe "$regex" |
     head -n -1
 }
 
@@ -136,12 +142,49 @@ untokenize()
 }
 
 parse_defs()
+{
+    case $opt3 in
+    "C")
+        parse_defs_C
+        ;;
+    "K")
+        parse_defs_K
+        ;;
+    "D")
+        parse_defs_D
+        ;;
+    esac
+}
+
+parse_defs_C()
+{
+    tmp=`mktemp -d`
+    full_path=$tmp/$opt2
+    git cat-file blob "$opt1" > "$full_path"
+    ctags -x --kinds-c=+p-m "$full_path" |
+    grep -avE "^operator |CONFIG_" |
+    awk '{print $1" "$2" "$3}'
+    rm "$full_path"
+    rmdir $tmp
+}
+
+parse_defs_K()
+{
+    tmp=`mktemp -d`
+    full_path=$tmp/$opt2
+    git cat-file blob "$opt1" > "$full_path"
+    ctags -x --language-force=kconfig "$full_path" |
+    awk '{print "CONFIG_"$1" "$2" "$3}'
+    rm "$full_path"
+    rmdir $tmp
+}
+
+parse_defs_D()
 {
     tmp=`mktemp -d`
     full_path=$tmp/$opt2
     git cat-file blob "$opt1" > "$full_path"
-    ctags -x --c-kinds=+p-m "$full_path" |
-    grep -av "^operator " |
+    ctags -x --language-force=dts "$full_path" |
     awk '{print $1" "$2" "$3}'
     rm "$full_path"
     rmdir $tmp
@@ -171,6 +214,7 @@ test $# -gt 0 || set help
 cmd=$1
 opt1=$2
 opt2=$3
+opt3=$4
 shift
 
 denormalize()

diff --git a/update.py b/update.py
@@ -156,9 +156,10 @@ def update_definitions(self, idxes):
                 hash = db.hash.get(idx)
                 filename = db.file.get(idx)
 
-            if not lib.hasSupportedExt(filename): continue
+            family = lib.getFileFamily(filename);
+            if family == None: continue
 
-            lines = scriptLines('parse-defs', hash, filename)
+            lines = scriptLines('parse-defs', hash, filename, family)
             for l in lines:
                 ident, type, line = l.split(b' ')
                 type = type.decode()
@@ -170,7 +171,8 @@ def update_definitions(self, idxes):
                     else:
                         obj = data.DefList()
 
-                obj.append(idx, type, line)
+                obj.add_family(family)
+                obj.append(idx, type, line, family)
                 if verbose:
                     print(f"def {type} {ident} in #{idx} @ {line}")
                 with defs_lock:
@@ -210,16 +212,23 @@ def update_references(self, idxes):
                 hash = db.hash.get(idx)
                 filename = db.file.get(idx)
 
-            if not lib.hasSupportedExt(filename): continue
+            family = lib.getFileFamily(filename)
+            if family == None: continue
 
-            tokens = scriptLines('tokenize-file', '-b', hash)
+            prefix = b''
+            # Kconfig values are saved as CONFIG_<value>
+            if family == 'K':
+                prefix = b'CONFIG_'
+
+            tokens = scriptLines('tokenize-file', '-b', hash, family)
             even = True
             line_num = 1
             idents = {}
             for tok in tokens:
                 even = not even
                 if even:
-
+                    tok = prefix + tok
+
                     with defs_lock:
                         if db.defs.exists(tok) and lib.isIdent(tok):
                             if tok in idents:
@@ -236,7 +245,7 @@ def update_references(self, idxes):
                 else:
                     obj = data.RefList()
 
-                obj.append(idx, lines)
+                obj.append(idx, lines, family)
                 if verbose:
                     print(f"ref: {ident} in #{idx} @ {lines}")
                 db.refs.put(ident, obj)
@@ -274,7 +283,8 @@ def update_doc_comments(self, idxes):
                 hash = db.hash.get(idx)
                 filename = db.file.get(idx)
 
-            if not lib.hasSupportedExt(filename): continue
+            family = lib.getFileFamily(filename)
+            if family == None: continue
 
             lines = scriptLines('parse-docs', hash, filename)
             for l in lines:
@@ -286,7 +296,7 @@ def update_doc_comments(self, idxes):
                 else:
                     obj = data.RefList()
 
-                obj.append(idx, str(line))
+                obj.append(idx, str(line), family)
                 if verbose:
                     print(f"doc: {ident} in #{idx} @ {line}")
                 db.docs.put(ident, obj)