# eval.py — evaluation script for the TopicAssistant.
#
# For each (discipline, text) row of a ground-truth CSV it asks the
# TopicAssistant for ranked topics, maps each top-level topic back to a
# discipline URI via skos:relatedMatch, and counts how often the ground
# truth is hit by the 1st topic, the 2nd topic, or any topic.
#
# NOTE: flat script with module-level network and file I/O; run it as
# `python eval.py`.
import json, sys, rdflib
from topic_assistant import TopicAssistant
import pandas as pd
from rdflib.namespace import RDF, SKOS

# Build topic-URI -> discipline-URI mapping from the WLO topic vocabulary.
g = rdflib.Graph()
g.parse("https://raw.githubusercontent.com/openeduhub/oeh-metadata-vocabs/master/oehTopics.ttl", format="ttl")
mapping = {}
for s, p, o in g.triples((None, SKOS.relatedMatch, None)):
    mapping[str(s)] = o

a = TopicAssistant()

# Ground truth: CSV with one (discipline, text) pair per row.
df = pd.read_csv("wirlernenonline2_wokw.csv", sep=',')
df.columns = ['discipline', 'text']

atfirst = 0   # ground truth hit by the 1st ranked topic
atsecond = 0  # ground truth hit by the 2nd ranked topic
atall = 0     # ground truth hit by any ranked topic
notfound = 0  # assistant returned no topics at all
num = 0       # rows evaluated so far (capped at 1000)

for index, row in df.iterrows():
    num += 1
    if num > 1000:
        break
    gtdis = row['discipline']

    print("########################################################")
    print(row['text'])

    result = a.go(row['text'])
    match = False
    idx = 0
    if 'children' not in result['WLO']:
        notfound += 1
    else:
        for i in range(len(result['WLO']['children'])):
            if idx <= i:
                # BUG FIX: `dis` used to be referenced before assignment when
                # a child entry had no keys; initialise it explicitly.
                dis = None
                for k in result['WLO']['children'][idx].keys():
                    uri = result['WLO']['children'][idx][k]['data']['uri']
                    label = result['WLO']['children'][idx][k]['data']['label']
                    # BUG FIX: a topic without a discipline mapping used to
                    # raise KeyError; treat it as "no discipline" instead.
                    mapped = mapping.get(uri)
                    if mapped is not None:
                        dis = str(mapped).replace('http://w3id.org/openeduhub/vocabs/discipline/', '')
                    idx += 1

                if dis is None:
                    continue  # nothing usable at this rank
                print(gtdis, dis)

                if idx == 1 and gtdis == dis:
                    atfirst += 1
                if idx == 2 and gtdis == dis:
                    atsecond += 1
                if gtdis == dis:
                    atall += 1
                    match = True
                    break
    print("")
    print(atfirst, atsecond, atall, num, notfound, match)

    # Running precision figures, printed after every row.
    relfirst = atfirst / num
    relsecond = atsecond / num
    relall = atall / num
    relfs = (atfirst + atsecond) / num
    print(relfirst, relsecond, relfs, relall)
# -*- coding: utf-8 -*-
# Notebook prototype of the topic assistant (extracted from
# topic_assistant.ipynb; the original cell outputs were exploratory data and
# are not code).
class TopicAssistent:
    """Prototype topic classifier over the OEH "oehTopics" SKOS vocabulary.

    ``__init__`` parses ``oehTopics.ttl`` into a ``treelib.Tree`` (root
    "WLO", one node per concept, weight ``w`` initialised to 0) and builds
    a per-concept keyword index from ``schema.org/keywords`` and
    ``skos:prefLabel`` values.  ``go(text)`` scores a deep copy of that
    tree against a text and prints the weighted, pruned sub-tree.

    NOTE(review): depends on module-level ``rdflib``/``treelib`` imports and
    the ``STOPWORDS`` set defined in the notebook's preamble.
    """

    def normalize(self, s):
        """Lower-case *s*, collapsing every run of non-alphanumeric
        characters (German umlauts and ß are kept) into a single space."""
        return re.sub('[^A-Za-z0-9öüäÖÄÜß]+', ' ', s).lower()

    def __init__(self):
        # Parse the topic vocabulary (local copy of
        # https://github.com/openeduhub/oeh-metadata-vocabs oehTopics.ttl).
        g = rdflib.Graph()
        g.parse("oehTopics.ttl", format="ttl")

        tree = Tree()
        # Root node: the skos:ConceptScheme; its skos:hasTopConcept
        # objects become the first tree level.
        for s, p, o in g.triples((None, RDF.type, SKOS.ConceptScheme)):
            tree.create_node("WLO", s, data={'w': 0, 'uri': s})
            for s2, p2, o2 in g.triples((s, SKOS.hasTopConcept, None)):
                tree.create_node(o2, o2, parent=s, data={'w': 0, 'uri': o2})

        # For every node already in the tree, attach its narrower concepts
        # (subjects of skos:broader) until a full pass adds nothing new.
        found_sth = True
        while found_sth:
            found_sth = False
            for node in tree.all_nodes():
                n = URIRef(node.tag)
                for s, p, o in g.triples((None, SKOS.broader, n)):
                    if not tree.contains(s):
                        tree.create_node(s, s, parent=node, data={'w': 0})
                        found_sth = True

        # Replace URI tags with human-readable prefLabels.
        for node in tree.all_nodes():
            for s, p, o in g.triples((URIRef(node.identifier), SKOS.prefLabel, None)):
                node.tag = o
                node.data['label'] = o

        # Keyword index: concept URI -> list of normalized index terms.
        # BUG FIX: the original bare `except:` clauses swallowed every
        # exception, not just the intended missing-key case; setdefault
        # expresses the intent directly.
        keywords = {}
        for s, p, o in g.triples((None, URIRef("https://schema.org/keywords"), None)):
            n = self.normalize(o)
            if len(n) > 2:
                keywords.setdefault(s, []).append(n)

        # prefLabels also act as index terms (deduplicated per concept).
        for s, p, o in g.triples((None, SKOS.prefLabel, None)):
            n = self.normalize(o)
            if len(n) > 2:
                terms = keywords.setdefault(s, [])
                if n not in terms:
                    terms.append(n)

        self.keywords = keywords
        self.tree = tree

    def go(self, exampleText):
        """Score the topic tree against *exampleText* and print the result.

        Prints per-phase timings, then shows the pruned tree (each surviving
        node annotated with its weight and matching keyword).  Returns None —
        this notebook prototype only displays its result.
        """
        start = time.process_time()
        newTree = Tree(self.tree, deep=True)
        print("init: ", time.process_time() - start)

        start = time.process_time()
        # Normalize the text and drop stopwords; pad with spaces so that
        # whole-word containment checks below are simple substring tests.
        tokens = [t for t in self.normalize(exampleText).split(' ')
                  if t not in STOPWORDS]
        ntext = " " + " ".join(tokens) + " "
        print("stopwords: ", time.process_time() - start)

        start = time.process_time()
        # Count whole-word keyword hits per concept.
        for c in self.keywords.keys():
            for k in self.keywords[c]:
                if ntext.find(" " + k + " ") > -1:
                    node = newTree.get_node(c)
                    node.data['w'] = node.data['w'] + 1
                    node.data['match'] = k
        print("keywords: ", time.process_time() - start)

        start = time.process_time()
        # Propagate weights bottom-up, deepest level first, so every
        # ancestor accumulates the hits of its whole subtree.
        # (Log label fixed: was misspelled "propergate".)
        for d in range(newTree.depth(), -1, -1):
            for node in newTree.all_nodes():
                if d == newTree.depth(node):
                    if node.data is not None and node.data['w'] > 0:
                        p = newTree.parent(node.identifier)
                        if p:
                            p.data['w'] = p.data['w'] + node.data['w']
        print("propagate: ", time.process_time() - start)

        start = time.process_time()
        # Prune zero-weight subtrees; annotate the surviving tags with
        # "(weight)" and, where present, "[matched keyword]".
        for node in newTree.all_nodes():
            if node and not node.is_root() and node.data is not None and node.data['w'] == 0:
                # A node may already be gone if an ancestor was removed.
                if newTree.contains(node.identifier):
                    newTree.remove_node(node.identifier)
            else:
                if node and not node.is_root() and node.data is not None:
                    node.tag = node.tag + " (" + str(node.data['w']) + ")"
                    if 'match' in node.data.keys():
                        node.tag = node.tag + " [" + node.data['match'] + "]"
        print("clear: ", time.process_time() - start)

        start = time.process_time()
        newTree.show(key=lambda node: node.data["w"], reverse=True, idhidden=True)
        print("show: ", time.process_time() - start)
Gemeinsam beratschlagen sie, was sie tun können, um das Mobbing zu stoppen. In der Realwelt und im Netz. (Online-Signatur Medienzentren: 4985739) Schüler-Schüler-Beziehung Internet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/topic_assistant.py b/src/topic_assistant.py index 056cef5..d443bd0 100644 --- a/src/topic_assistant.py +++ b/src/topic_assistant.py @@ -23,12 +23,37 @@ def normalize(self, s): def __init__(self): - # create a RDF graph + # collect discipline labels + self.disciplineLabels={} + gdis = rdflib.Graph() + result = gdis.parse("https://raw.githubusercontent.com/openeduhub/oeh-metadata-vocabs/master/discipline.ttl", format="ttl") + for s, p, o in gdis.triples((None, SKOS.prefLabel, None)): + try: + self.disciplineLabels[s].append(str(o)) + except: + self.disciplineLabels[s]=[str(o)] + for s, p, o in gdis.triples((None, SKOS.altLabel, None)): + try: + self.disciplineLabels[s].append(str(o)) + except: + self.disciplineLabels[s]=[str(o)] + + #print (self.disciplineLabels) + + # create an RDF graph fo rthe topics g = rdflib.Graph() result = g.parse("https://raw.githubusercontent.com/openeduhub/oeh-metadata-vocabs/master/oehTopics.ttl", format="ttl") #result = g.parse("oehTopics.ttl", format="ttl") + # collect discipline mappings + 
self.disciplineMappings={} + for s, p, o in g.triples((None, SKOS.relatedMatch, None)): + for s2, p2, o2 in g.triples((s, SKOS.topConceptOf, None)): + self.disciplineMappings[s]=o + + + # build the topic tree tree = Tree() #find top level node for s, p, o in g.triples((None, RDF.type, SKOS.ConceptScheme)): @@ -36,7 +61,7 @@ def __init__(self): tree.create_node("WLO", s, data={'w':0, 'uri': s}) for s2, p2, o2 in g.triples((s, SKOS.hasTopConcept, None)): #print (s2, p2, o2) - tree.create_node(o2, o2, parent=s, data={'w':0, 'uri': o2}) + tree.create_node(o2, o2, parent=s, data={'w':0, 'uri': str(o2)}) foundSth = True while foundSth: @@ -48,23 +73,24 @@ def __init__(self): tree.create_node(s, s, parent=node, data={'w':0}) foundSth = True - + # collect the labels for node in tree.all_nodes(): for s, p, o in g.triples(( URIRef(node.identifier) , SKOS.prefLabel, None)): node.tag=o node.data['label']=o + # collect the "index terms" from keywords, preflabels, and discipline labels keywords={} for s, p, o in g.triples((None, URIRef("https://schema.org/keywords"), None)): #print (s, o) - n = self.normalize(o) - if len(n)>2: - try: - keywords[s].append(n) - except: - keywords[s]=[] - keywords[s].append(n) + for k in str(o).split(','): + n = self.normalize(k) + if len(n)>2: + try: + keywords[s].append(n) + except: + keywords[s]=[n] for s, p, o in g.triples(( None , SKOS.prefLabel, None)): n = self.normalize(o) @@ -73,12 +99,27 @@ def __init__(self): if not n in keywords[s]: keywords[s].append(n) except: - keywords[s]=[] - keywords[s].append(n) + keywords[s]=[n] + + if s in self.disciplineMappings.keys(): + disciplines = self.disciplineLabels[self.disciplineMappings[s]] + for d in disciplines: + n = self.normalize(d) + try: + if not n in keywords[s]: + keywords[s].append(n) + except: + keywords[s]=[n] + + self.keywords = keywords self.tree = tree + #for k in keywords.keys(): + # print(k, keywords[k]) + #tree.show(key=lambda node: node.data["w"], reverse=True, idhidden=True) + 
#sys.exit() def go(self, exampleText): T = Tree(self.tree, deep=True) @@ -94,7 +135,10 @@ def go(self, exampleText): if ntext.find(" " + k + " ")>-1 : T.get_node(c).data['w']=T.get_node(c).data['w']+1 - T.get_node(c).data['match']=k + try: + T.get_node(c).data['match']=T.get_node(c).data['match'] + ", " + k + except: + T.get_node(c).data['match']=k #print (c, k) # propagate data to the root