diff --git a/orangecontrib/storynavigation/netviz/graphview.py b/orangecontrib/storynavigation/netviz/graphview.py new file mode 100644 index 0000000..b3c7e4a --- /dev/null +++ b/orangecontrib/storynavigation/netviz/graphview.py @@ -0,0 +1,413 @@ +"""The graph class reused/modified from the original one defined in the Orange3 Network add-on: https://github.com/biolab/orange3-network + +""" + +import time + +import numpy as np +import pyqtgraph as pg + +from AnyQt.QtCore import QLineF, Qt, QRectF +from AnyQt.QtGui import QPen, QColor + +from Orange.util import scale +from Orange.widgets.settings import Setting +from Orange.widgets.visualize.owscatterplotgraph import OWScatterPlotBase + + +class PlotVarWidthCurveItem(pg.PlotCurveItem): + def __init__(self, directed, *args, **kwargs): + self.directed = directed + self.widths = kwargs.pop("widths", None) + self.setPen(kwargs.pop("pen", pg.mkPen(0.0))) + self.sizes = kwargs.pop("size", None) + self.selection = kwargs.pop("selection", None) + self.coss = self.sins = None + super().__init__(*args, **kwargs) + + def setWidths(self, widths): + self.widths = widths + self.update() + + def setPen(self, pen): + self.pen = pen + self.pen.setCapStyle(Qt.RoundCap) + + def setData(self, *args, **kwargs): + self.widths = kwargs.pop("widths", self.widths) + self.setPen(kwargs.pop("pen", self.pen)) + self.sizes = kwargs.pop("size", self.sizes) + self.selection = kwargs.pop("selection", self.selection) + super().setData(*args, **kwargs) + + def paint(self, p, opt, widget): + def get_arrows(): + # Compute (n, 4) array of coordinates of arrows' ends + # Arrows are at 15 degrees; length is 10, clipped to edge length + x0, y0, x1, y1 = edge_coords.T + arr_len = np.clip(lengths - sizes1 - w3, 0, 10) + cos12 = arr_len * np.cos(np.pi / 12) + sin12 = arr_len * np.sin(np.pi / 12) + + # cos(a ± 15) = cos(a) cos(15) ∓ sin(a) sin(15) + tx = sins * (fx * sin12) + x1a = x1 - coss * (fx * cos12) + x2a = x1a - tx + x1a += tx + + # sin(a ± 15) = sin(a) cos(15) ± sin(15) cos(a) + ty = (fy * sin12) * coss + y1a = y1 + sins * (fy * cos12) + y2a = y1a - ty + y1a += ty + return np.vstack((x1a, y1a, x2a, y2a)).T + + def get_short_edge_coords(): + # Compute the target-side coordinates of edges with arrows + # Such edges are shorted by 8 pixels + width / 3 + off = 8 + w3 + return edge_coords[:, 2:] + (off * np.vstack((-fxcos, fysin))).T + + if self.xData is None or len(self.xData) == 0: + return + + # Widths of edges, divided by 3; used for adjusting sizes + w3 = (self.widths if self.widths is not None else self.pen.width()) / 3 + + # Sizes of source and target nodes; they are used for adjusting the + # edge lengths, so we increase the sizes by edge widths / 3 + sizes0, sizes1 = self.sizes[::2] + w3, self.sizes[1::2] + w3 + + # Coordinates of vertices for all end points (in real world) + x0s, x1s = self.xData[::2], self.xData[1::2] + y0s, y1s = self.yData[::2], self.yData[1::2] + + # Factors for transforming real-worlds coordinates into pixels + fx = 1 / p.worldTransform().m11() + fy = 1 / p.worldTransform().m22() + + # Computations of angles (lengths are also used to clip the arrows) + # We need sine and cosine of angles, and never the actual angles. 
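+        # In widget coordinates the y axis points downward, hence the sign flip on
+        # the y difference below; zero-length edges (self-loops) give NaN ratios,
+        # which np.nan_to_num zeroes out and the `arcs` mask routes to the separate
+        # self-loop drawing code at the end of paint().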
+ # Sine and cosine are compute as ratios in triangles rather than with + # trigonometric functions + diffx, diffy = (x1s - x0s) / fx, -(y1s - y0s) / fy + lengths = np.sqrt(diffx ** 2 + diffy ** 2) + arcs = lengths == 0 + coss, sins = np.nan_to_num(diffx / lengths), np.nan_to_num(diffy / lengths) + + # A slower version of the above, with trigonometry + # angles = np.arctan2(-(y1s - y0s) / fy, (x1s - x0s) / fx) + # return np.cos(angles), np.sin(angles) + + # Sin and cos are mostly used as mulitplied with fx and fy; precompute + fxcos, fysin = fx * coss, fy * sins + + # Coordinates of edges' end points: coordinates of vertices, adjusted + # by sizes. When drawing arraws, the target coordinate is used for + # the tip of the arrow, not the edge + edge_coords = np.vstack((x0s + fxcos * sizes0, y0s - fysin * sizes0, + x1s - fxcos * sizes1, y1s + fysin * sizes1)).T + + pen = QPen(self.pen) + SHOW_COLOR = QColor(0, 0, 0, 255) + HIDE_COLOR = QColor(0, 0, 0, 10) + p.setRenderHint(p.Antialiasing, True) + p.setCompositionMode(p.CompositionMode_SourceOver) + if self.widths is None: + if self.directed: + for (x0, y0, x1, y1), (x1w, y1w), (xa1, ya1, xa2, ya2), arc, sel in zip( + edge_coords, get_short_edge_coords(), get_arrows(), arcs, self.selection): + if not arc: + pen.setColor(SHOW_COLOR if sel else HIDE_COLOR) + p.setPen(pen) + p.drawLine(QLineF(x0, y0, x1w, y1w)) + p.drawLine(QLineF(xa1, ya1, x1, y1)) + p.drawLine(QLineF(xa2, ya2, x1, y1)) + else: + for ecoords in edge_coords[~arcs]: + p.setPen(pen) + p.drawLine(QLineF(*ecoords)) + else: + if self.directed: + for (x0, y0, x1, y1), (x1w, y1w), (xa1, ya1, xa2, ya2), w, arc, sel in zip( + edge_coords, get_short_edge_coords(), get_arrows(), + self.widths, arcs, self.selection): + if not arc: + pen.setColor(SHOW_COLOR if sel else HIDE_COLOR) + pen.setWidth(w) + p.setPen(pen) + p.drawLine(QLineF(x0, y0, x1w, y1w)) + p.drawLine(QLineF(xa1, ya1, x1, y1)) + p.drawLine(QLineF(xa2, ya2, x1, y1)) + else: + for ecoords, w in zip(edge_coords[~arcs], self.widths[~arcs]): + pen.setWidth(w) + p.setPen(pen) + p.drawLine(QLineF(*ecoords)) + + # This part is not so optimized because there can't be that many loops + if np.any(arcs): + xs, ys = self.xData[::2][arcs], self.yData[1::2][arcs] + sizes = self.sizes[::2][arcs] + sizes += w3 if isinstance(w3, float) else w3[arcs] + # if radius of loop would be size, then distance betwween + # vertex and loop centers would be + # d = np.sqrt(size ** 2 - r ** 2 / 2) + r / np.sqrt(2) + r / 2 + ds = sizes * (1 + np.sqrt(2) / 4) + rxs = xs - ds * fx + rys = ys - ds * fy + rfxs = sizes * fx + rfys = sizes * fy + + ax0o = 6 * np.cos(np.pi * 5 / 6) * fx + ax1o = 6 * np.cos(np.pi * 7 / 12) * fx + ay0o = 6 * np.sin(np.pi * 5 / 6) * fy + ay1o = 6 * np.sin(np.pi * 7 / 12) * fy + + if self.widths is None: + widths = np.full(len(rxs), pen.width()) + else: + widths = self.widths[arcs] + for rx, ry, rfx, rfy, w in zip(rxs, rys, rfxs, rfys, widths): + rect = QRectF(rx, ry, rfx, rfy) + pen.setWidth(w) + p.setPen(pen) + p.drawArc(rect, 100 * 16, 250 * 16) + if self.directed: + rx += 1.1 * rfx + ry += rfy / 2 + p.drawLine(QLineF(rx, ry, rx + ax0o, ry - ay0o)) + p.drawLine(QLineF(rx, ry, rx + ax1o, ry - ay1o)) + + +class GraphView(OWScatterPlotBase): + show_edge_weights = Setting(False) + relative_edge_widths = Setting(True) + edge_width = Setting(2) + label_selected_edges = Setting(True) + + COLOR_NOT_SUBSET = (255, 255, 255, 255) + COLOR_SUBSET = (0, 0, 0, 255) + COLOR_DEFAULT = (255, 255, 255, 0) + + class Simplifications: + NoLabels, NoEdges, 
NoEdgeLabels, NoDensity, = 1, 2, 4, 8 + NoSimplifications, All = 0, 255 + + def __init__(self, master, parent=None): + super().__init__(master) + self._reset_attributes() + self.simplify = self.Simplifications.NoSimplifications + self.step_resizing.connect(self.update_edges) + self.end_resizing.connect(self.update_edges) + + def clear(self): + super().clear() + self._reset_attributes() + + def _reset_attributes(self): + self.pair_indices = None + self.edge_curve = None + self.edge_labels = [] + self.scatterplot_marked = None + self.last_click = (-1, None) + + def update_coordinates(self): + super().update_coordinates() + self.update_marks() + self.update_edges() + + def set_simplifications(self, simplifications): + S = self.Simplifications + for flag, remove, update in ( + (S.NoDensity, self._remove_density, self.update_density), + (S.NoLabels, self._remove_labels, self.update_labels), + (S.NoEdges, self._remove_edges, self.update_edges), + (S.NoEdgeLabels, + self._remove_edge_labels, self.update_edge_labels)): + if simplifications & flag != self.simplify & flag: + if simplifications & flag: + self.simplify += flag + remove() + else: + self.simplify -= flag + update() + + def update_edges(self): + if not self.scatterplot_item \ + or self.simplify & self.Simplifications.NoEdges: + return + x, y = self.scatterplot_item.getData() + edges = self.master.get_edges() + srcs = edges['source'].to_numpy() + dests = edges['target'].to_numpy() + weights = edges['weight'].to_numpy() # pen width needs to be int + if self.edge_curve is None: + self.pair_indices = np.empty((2 * len(srcs),), dtype=int) + self.pair_indices[::2] = srcs + self.pair_indices[1::2] = dests + + data = dict(x=x[self.pair_indices], y=y[self.pair_indices], + pen=self._edge_curve_pen(), antialias=True, + size=self.scatterplot_item.data["size"][self.pair_indices] / 2) + if self.relative_edge_widths and len(set(weights)) > 1: + data['widths'] = scale(weights, .7, 8) * np.log2(self.edge_width / 4 + 1) + data['widths'] = data['widths'].astype(int) + else: + data['widths'] = None + + # TODO: find edges that should be hidden based on the node selection + if self.selection is None: + edge_mark = np.ones(len(edges.index), dtype=int).tolist() + else: + edge_mark = np.zeros(len(edges.index), dtype=int).tolist() + selected = [i for i, v in enumerate(self.selection) if v] + relevant_edges = edges[edges['source'].isin(selected) | edges['target'].isin(selected)] + for i in relevant_edges.index.tolist(): + edge_mark[i] = 1 + data['selection'] = edge_mark + + if self.edge_curve is None: + self.edge_curve = PlotVarWidthCurveItem(True, **data) + self.edge_curve.setZValue(-10) + self.plot_widget.addItem(self.edge_curve) + else: + self.edge_curve.setData(**data) + self.update_edge_labels() + + def set_edge_pen(self): + if self.edge_curve: + self.edge_curve.setPen(self._edge_curve_pen()) + + def _edge_curve_pen(self): + return pg.mkPen({ + 'color': '#000', + 'width': self.edge_width, + 'cosmetic': True + }) + + def update_edge_labels(self): + for label in self.edge_labels: + self.plot_widget.removeItem(label) + self.edge_labels = [] + if self.scatterplot_item is None \ + or not self.show_edge_weights \ + or self.simplify & self.Simplifications.NoEdgeLabels: + return + edges = self.master.get_edges() + if edges is None: + return + srcs, dests, weights = edges.row, edges.col, edges.data + if self.label_selected_edges: + selected = self._selected_and_marked() + num_selected = np.sum(selected) + if num_selected >= 2: + selected_edges = selected[srcs] & 
selected[dests] + else: + selected_edges = selected[srcs] | selected[dests] + srcs = srcs[selected_edges] + dests = dests[selected_edges] + weights = weights[selected_edges] + if np.allclose(weights, np.round(weights)): + labels = [str(x) for x in weights.astype(np.int)] + else: + labels = ["{:.02}".format(x) for x in weights] + x, y = self.scatterplot_item.getData() + xs = (x[srcs.astype(np.int64)] + x[dests.astype(np.int64)]) / 2 + ys = (y[srcs.astype(np.int64)] + y[dests.astype(np.int64)]) / 2 + black = pg.mkColor(0, 0, 0) + for label, x, y in zip(labels, xs, ys): + ti = pg.TextItem(label, black) + ti.setPos(x, y) + self.plot_widget.addItem(ti) + self.edge_labels.append(ti) + + def _remove_edges(self): + if self.edge_curve: + self.plot_widget.removeItem(self.edge_curve) + self.edge_curve = None + self._remove_edge_labels() + + def _remove_edge_labels(self): + for label in self.edge_labels: + self.plot_widget.removeItem(label) + self.edge_labels = [] + + def update_density(self): + if not self.simplify & self.Simplifications.NoDensity: + super().update_density() + self.set_edge_pen() + + # pylint: disable=access-member-before-definition + def _remove_density(self): + if self.density_img: + self.plot_widget.removeItem(self.density_img) + self.density_img = None + + def _selected_and_marked(self): + if self.selection is None: + selection = np.zeros(len(self.scatterplot_item.data), dtype=bool) + else: + selection = np.array(self.selection, dtype=bool) + marked = self.master.get_marked_nodes() + if marked is not None: + selection[marked] = 1 + return selection + + def update_labels(self): + if self.simplify & self.Simplifications.NoLabels: + return + # This is not nice, but let's not add methods to the parent just + # to support this specific needs of network explorer + # pylint: disable=access-member-before-definition + saved_selection = self.selection + if self.label_only_selected and self.scatterplot_item: + marked = self.master.get_marked_nodes() + if marked is not None and len(marked): + self.selection = self._selected_and_marked() + super().update_labels() + self.selection = saved_selection + + def _remove_labels(self): + # pylint: disable=access-member-before-definition + for label in self.labels: + self.plot_widget.removeItem(label) + self.labels = [] + + def update_marks(self): + if self.scatterplot_marked is None: + self.scatterplot_marked = pg.ScatterPlotItem([], []) + self.scatterplot_marked.setZValue(-5) + self.plot_widget.addItem(self.scatterplot_marked) + + self.update_edge_labels() + x, y = self.get_coordinates() + labels = self.master.get_node_labels().to_numpy() + if x is None: # sanity check; there can be no marked nodes if x is None + return + self.scatterplot_marked.clear() + self.scatterplot_marked.addPoints( + x[labels=='supportive'], y[labels=='supportive'], + size=25, pen=pg.mkPen('green', width=3), brush=pg.mkBrush(None)) + self.scatterplot_marked.addPoints( + x[labels=='defeated'], y[labels=='defeated'], + size=25, pen=pg.mkPen('red', width=3), brush=pg.mkBrush(None)) + + def select_by_click(self, _, points): + # Poor man's double click + indices = [p.data() for p in points] + last_time, last_indices = self.last_click + self.last_click = (time.time(), indices) + if time.time() - last_time < 0.5 and indices == last_indices: + indices = self.master.get_reachable(indices) + self.select_by_indices(indices) + + def unselect_all(self): + super().unselect_all() + if self.label_selected_edges: + self.update_edge_labels() + + def _update_after_selection(self): + if 
self.label_selected_edges: + self.update_edge_labels() + super()._update_after_selection() \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/rules/multilingual_dsg_patterns_en.json b/orangecontrib/storynavigation/rules/multilingual_dsg_patterns_en.json similarity index 100% rename from orangecontrib/storynavigation/widgets/rules/multilingual_dsg_patterns_en.json rename to orangecontrib/storynavigation/rules/multilingual_dsg_patterns_en.json diff --git a/orangecontrib/storynavigation/widgets/rules/multilingual_dsg_patterns_nl.json b/orangecontrib/storynavigation/rules/multilingual_dsg_patterns_nl.json similarity index 100% rename from orangecontrib/storynavigation/widgets/rules/multilingual_dsg_patterns_nl.json rename to orangecontrib/storynavigation/rules/multilingual_dsg_patterns_nl.json diff --git a/orangecontrib/storynavigation/widgets/tests/__init__.py b/orangecontrib/storynavigation/tests/__init__.py similarity index 100% rename from orangecontrib/storynavigation/widgets/tests/__init__.py rename to orangecontrib/storynavigation/tests/__init__.py diff --git a/orangecontrib/storynavigation/widgets/tests/testOWSNDSGDepParser.py b/orangecontrib/storynavigation/tests/testOWSNDSGDepParser.py similarity index 100% rename from orangecontrib/storynavigation/widgets/tests/testOWSNDSGDepParser.py rename to orangecontrib/storynavigation/tests/testOWSNDSGDepParser.py diff --git a/orangecontrib/storynavigation/widgets/tests/testOWSNDSGRuleset.py b/orangecontrib/storynavigation/tests/testOWSNDSGRuleset.py similarity index 89% rename from orangecontrib/storynavigation/widgets/tests/testOWSNDSGRuleset.py rename to orangecontrib/storynavigation/tests/testOWSNDSGRuleset.py index 183dd09..6d5f39c 100644 --- a/orangecontrib/storynavigation/widgets/tests/testOWSNDSGRuleset.py +++ b/orangecontrib/storynavigation/tests/testOWSNDSGRuleset.py @@ -1,12 +1,12 @@ import unittest # import scipy.sparse as sp -from orangecontrib.storynavigation.widgets.OWSNDSGRuleset import OWSNDSGRuleset +from orangecontrib.storynavigation.widgets.OWSNNarrativeNetwork import OWSNNarrativeNetwork from orangewidget.tests.base import WidgetTest -class TestOWSNDSGRuleset(WidgetTest): +class TestOWSNNarrativeNetwork(WidgetTest): def setUp(self): - self.widget = self.create_widget(OWSNDSGRuleset) + self.widget = self.create_widget(OWSNNarrativeNetwork) # self.small_undir = _create_net(((0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0), (2, 3, 1.0)), n=5) # self.small_dir = _create_net(((0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0), (2, 3, 1.0)), n=5, directed=True) # self.empty_net = Network([], UndirectedEdges(sp.coo_matrix((0, 0)))) diff --git a/orangecontrib/storynavigation/widgets/tests/testOWSNDSGSRL.py b/orangecontrib/storynavigation/tests/testOWSNDSGSRL.py similarity index 100% rename from orangecontrib/storynavigation/widgets/tests/testOWSNDSGSRL.py rename to orangecontrib/storynavigation/tests/testOWSNDSGSRL.py diff --git a/orangecontrib/storynavigation/widgets/utils/dutchstopwords.txt b/orangecontrib/storynavigation/utils/dutchstopwords.txt similarity index 100% rename from orangecontrib/storynavigation/widgets/utils/dutchstopwords.txt rename to orangecontrib/storynavigation/utils/dutchstopwords.txt diff --git a/orangecontrib/storynavigation/widgets/utils/qrangeslider.py b/orangecontrib/storynavigation/utils/qrangeslider.py similarity index 100% rename from orangecontrib/storynavigation/widgets/utils/qrangeslider.py rename to orangecontrib/storynavigation/utils/qrangeslider.py diff --git 
a/orangecontrib/storynavigation/widgets/OWArgExplorer.py b/orangecontrib/storynavigation/widgets/OWArgExplorer.py new file mode 100644 index 0000000..c2c4bb3 --- /dev/null +++ b/orangecontrib/storynavigation/widgets/OWArgExplorer.py @@ -0,0 +1,367 @@ +import numpy as np +import networkx as nx +import sys + +from AnyQt.QtCore import Qt + +from Orange.data import Table +from Orange.widgets import gui +from Orange.widgets.widget import Input +from Orange.widgets.visualize.utils.widget import OWDataProjectionWidget, OWWidget +from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin +from Orange.widgets.settings import SettingProvider, Setting +from Orange.widgets.utils.plot import OWPlotGUI +from Orange.data.pandas_compat import table_to_frame +from Orange.widgets.utils.widgetpreview import WidgetPreview + +from orangecontrib.storynavigation.netviz.graphview import GraphView + +import pandas as pd + +""" Copyright 2023, Ji Qi, Netherlands eScience Center, NL, j.qi@esciencecenter.nl' + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.""" + +GRAPH_LAYOUT = ('spring', 'multipartite', 'kamada kawai', 'spectral') + +class OWArgExplorer(OWDataProjectionWidget): + name = 'Narrative Network Explorer' + description = 'Enables visual plotting and exploration of entities anf their relationships in narrative texts' + icon = 'icons/OWArgExplorer.svg' + + class Inputs: + edge_data = Input('Edge Data', Table) + node_data = Input('Node Data', Table) + + GRAPH_CLASS = GraphView + graph = SettingProvider(GraphView) + + node_sparsity = Setting(5) + graph_layout = Setting(GRAPH_LAYOUT[0]) # comboBox widget returns index of the selection + # sourcenode_column_name = Setting(0) + # targetnode_column_name = Setting(1) + # idx = 0 + + def __init__(self): + super().__init__() + + self.edge_data = None + self.node_data = None + self.positions = None + + def _add_controls(self): + self.gui = OWPlotGUI(self) + layout = gui.vBox(self.controlArea, box='Layout') + gui.comboBox(layout, self, 'graph_layout', + label='Graph layout', + sendSelectedValue=True, + items=GRAPH_LAYOUT, + callback=self.relayout) + self.sparsity_control = gui.hSlider(layout, self, "node_sparsity", + minValue=0, maxValue=10, intOnly=False, + label="Node sparsity", orientation=Qt.Horizontal, + callback_finished=self.relayout) + + # edge_selection = gui.vBox(self.controlArea, box='Edge selection') + + # # df_edge = None + # if hasattr(self, 'edge_data'): + # print(table_to_frame(self.edge_data).columns) + # print(type(table_to_frame(self.edge_data).columns)) + # print("yes") + + # gui.comboBox(edge_selection, self, 'sourcenode_column_name', + # label='Source node column', + # sendSelectedValue=True, + # items=table_to_frame(self.edge_data).columns, + # callback=self.relayout) + + # gui.comboBox(edge_selection, self, 'targetnode_column_name', + # label='Target node column', + # sendSelectedValue=True, + # items=table_to_frame(self.edge_data).columns, + # callback=self.relayout) + # else: + # print("here...") + + @Inputs.edge_data + def set_edge_data(self, data): + 
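+        # The edge table is converted with table_to_frame(): set_positions() reads its
+        # 'subject_id' and 'object_id' columns to build the networkx graph, while
+        # GraphView.update_edges() expects 'source', 'target' and 'weight' columns
+        # in the frame returned by get_edges().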
self.edge_data = data + + @Inputs.node_data + def set_node_data(self, data): + self.node_data = data + + def handleNewSignals(self): + self.AlphaValue = 0 + self.relayout() + + def relayout(self): + """recompute positions of nodes and reset the graph + """ + if self.node_data is None or self.edge_data is None: + return + + self.sparsity_control.setEnabled(self.graph_layout == GRAPH_LAYOUT[0]) + self.set_positions() + self.closeContext() + self.data = self.node_data + self.valid_data = np.full(len(self.data), True, dtype=bool) + self.openContext(self.data) + self.graph.reset_graph() + + + # def table_to_frame_custom(self, tab, include_metas=False): + # """ + # Convert Orange.data.Table to pandas.DataFrame + # Parameters + # ---------- + # tab : Table + # include_metas : bool, (default=False) + # Include table metas into dataframe. + # Returns + # ------- + # pandas.DataFrame + # """ + + # def _column_to_series(col, vals): + # result = () + # if col.is_discrete: + # codes = pd.Series(vals).fillna(-1).astype(int) + # result = (col.name, pd.Categorical.from_codes( + # codes=codes, categories=col.values, ordered=True + # )) + # elif col.is_time: + # result = (col.name, pd.to_datetime(vals, unit='s').to_series().reset_index()[0]) + # elif col.is_continuous: + # dt = float + # # np.nan are not compatible with int column + # # using pd.isnull since np.isnan fails on array with dtype object + # # which can happen when metas contain column with strings + # if col.number_of_decimals == 0 and not np.any(pd.isnull(vals)): + # dt = int + # result = (col.name, pd.Series(vals).astype(dt)) + # elif col.is_string: + # result = (col.name, pd.Series(vals)) + # return result + + # def _columns_to_series(cols, vals): + # return [_column_to_series(col, vals[:, i]) for i, col in enumerate(cols)] + + # x, y, metas = [], [], [] + # domain = tab.domain + # print("domain: ", domain) + # print() + # if domain.attributes: + # print("domain attributes: ", domain.attributes) + # print() + # x = _columns_to_series(domain.attributes, tab.X) + # print("Table X: ", tab.X) + # print() + # if domain.class_vars: + # print("domain class_vars: ", domain.class_vars) + # print() + # y_values = tab.Y.reshape(tab.Y.shape[0], len(domain.class_vars)) + # print("Table Y: ", tab.Y) + # print() + # print("Table Y reshaped: ", y_values) + # print() + # y = _columns_to_series(domain.class_vars, y_values) + # if domain.metas: + # print("domain metas: ", domain.metas) + # print() + # print("Table metas: ", tab.metas) + # print() + + # metas = _columns_to_series(domain.metas, tab.metas) + + # all_series = dict(x + y + metas) + # print("x series: ", x) + # print() + # print("y series: ", y) + # print() + + # all_vars = tab.domain.variables + # print("Table domain variables: ", tab.domain.variables) + # print() + # if include_metas: + # all_vars += tab.domain.metas + # print("all_vars: ", all_vars) + # print() + # original_column_order = [var.name for var in all_vars] + # unsorted_columns_df = pd.DataFrame(all_series) + # print("len: ", len(unsorted_columns_df)) + # print() + # return unsorted_columns_df[original_column_order] + + def set_positions(self): + """set coordinates of nodes to self.positions. + Args: + layout (str, optional): name of layout. Defaults to "sfdp". 
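+        Note: the layout actually applied is self.graph_layout, one of
+        GRAPH_LAYOUT ('spring', 'multipartite', 'kamada kawai', 'spectral');
+        the node-sparsity slider only affects the spring layout.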
+ """ + df_edge = table_to_frame(self.edge_data) + df_node = table_to_frame(self.node_data, include_metas=True) + + print() + print() + print(df_edge) + print() + print() + + print() + print() + print(df_node) + print() + print() + + # df_edge = self.table_to_frame_custom(self.edge_data, include_metas=True) + # df_node = self.table_to_frame_custom(self.node_data, include_metas=True) + # df_edge = self.edge_data + # df_node = self.node_data + + # df_node = {} + # df_node['label'] = [] + # for item in df_edge: + # df_node['label'].append(item['subject']) + # print("main man") + # df_edge = pd.DataFrame([[1, 2, 0.5],[2, 3, 0.3],[3, 1, 0.2],[3, 4, 0.7],[1, 4, 0.8],[2, 5, 0.5],[5, 1, 0.5]], columns=['source', 'target', 'weight']) + # df_node = pd.DataFrame([[1, 1, 'label1'], [2, 2, 'label1'], [3, 3, 'label2'], [4, 1, 'label1'], [5, 5, 'label2']], columns=['field1', 'field2', 'field3']) + # print(df_edge.columns) + # print(df_edge.head()) + G = nx.from_pandas_edgelist( + df_edge, + source='subject_id', target='object_id', + create_using=nx.DiGraph()) + + # print("klsdajfbaksbdgksdbagjkg") + # for node in G.nodes: + # nx.set_node_attributes(G, name=node, values={"label" : "test"}) + + node_attrs = {i: {'subset': df_node['label'][i]} for i in G.nodes} + + print() + print() + print("node_attrs: ", node_attrs) + print() + print() + nx.set_node_attributes(G, node_attrs) + + if len(G.nodes) < df_node.shape[0]: + remain_nodes = df_node.iloc[~df_node.index.isin(G.nodes)] + G.add_nodes_from(remain_nodes.index.tolist()) + # # in case arguments not appear in the attacking network + # if len(G.nodes) < df_node.shape[0]: + # remain_nodes = df_node.iloc[~df_node.index.isin(G.nodes)] + # G.add_nodes_from(remain_nodes.index.tolist()) + + if self.graph_layout == GRAPH_LAYOUT[0]: + print("method1") + spasity = (self.node_sparsity + 1) / 11.0 + pos_dict = nx.spring_layout(G, k=spasity, seed=10) + elif self.graph_layout == GRAPH_LAYOUT[1]: + print("method2") + pos_dict = nx.multipartite_layout(G) + elif self.graph_layout == GRAPH_LAYOUT[2]: + print("method3") + pos_dict = nx.kamada_kawai_layout(G) + elif self.graph_layout == GRAPH_LAYOUT[3]: + print("method4") + pos_dict = nx.spectral_layout(G) + + print() + print("dict keys len: ", len(pos_dict)) + print() + print() + print("dict keys: ", pos_dict) + print() + + + self.positions = [] + idx = 1 + for i in sorted(pos_dict.keys()): + if idx <= len(df_edge): + self.positions.append(pos_dict[i]) + idx += 1 + + self.positions = np.array([*self.positions]) + + print() + print("asdasd: ", len(self.positions)) + print() + + + def get_embedding(self): + print("got here!!!!!!") + print(self.positions.ndim) + print() + print("ttt: ", len(self.positions)) + print() + return self.positions # check if the boolean index error stems from here... + + def get_edges(self): + print("ttt2: ", len(self.edge_data)) + print() + return table_to_frame(self.edge_data) + + def get_marked_nodes(self): + return None + + def get_node_labels(self): + print("ttt3: ", len(self.node_data)) + print() + print() + print("node labels!!! 
", table_to_frame(self.node_data)['label']) + print() + return table_to_frame(self.node_data)['label'] + + def selection_changed(self): + super().selection_changed() + self.graph.update_edges() + + +def main(): + #network = read_pajek(join(dirname(dirname(__file__)), 'networks', 'leu_by_genesets.net')) + # network = read_pajek(join(dirname(dirname(__file__)), 'networks', 'lastfm.net')) + #network = read_pajek(join(dirname(dirname(__file__)), 'networks', 'Erdos02.net')) + #transform_data_to_orange_table(network) + # WidgetPreview(OWSNDSGDepParser).run(set_graph=network) + WidgetPreview(OWArgExplorer).run() + +if __name__ == "__main__": + main() + + +# def main(argv=sys.argv): +# from AnyQt.QtWidgets import QApplication +# app = QApplication(list(argv)) +# args = app.arguments() +# if len(args) > 1: +# filename = args[1] +# else: +# filename = "iris" + +# ow = OWArgExplorer() +# ow.show() +# ow.raise_() + +# dataset = Table(filename) +# # ow.set_data(dataset) +# ow.handleNewSignals() +# app.exec_() +# # ow.set_data(None) +# # ow.handleNewSignals() +# return 0 + + +# if __name__ == "__main__": +# sys.exit(main()) \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/OWSNDSGDepParser.py b/orangecontrib/storynavigation/widgets/OWSNDSGDepParser.py index edfb375..4717afa 100644 --- a/orangecontrib/storynavigation/widgets/OWSNDSGDepParser.py +++ b/orangecontrib/storynavigation/widgets/OWSNDSGDepParser.py @@ -36,7 +36,7 @@ import sys class OWSNDSGDepParser(OWWidget): - name = 'DSG dep-parser' + name = 'Anthology Network Analysis' description = 'Digital Story Grammer: Dutch dependency parsing with Stanza' icon = 'icons/dsg_stanzadep_icon.png' priority = 6430 @@ -197,7 +197,7 @@ def main(): # network = read_pajek(join(dirname(dirname(__file__)), 'networks', 'lastfm.net')) #network = read_pajek(join(dirname(dirname(__file__)), 'networks', 'Erdos02.net')) #transform_data_to_orange_table(network) - # WidgetPreview(OWSNDSGRuleset).run(set_graph=network) + # WidgetPreview(OWSNDSGDepParser).run(set_graph=network) WidgetPreview(OWSNDSGDepParser).run() if __name__ == "__main__": diff --git a/orangecontrib/storynavigation/widgets/OWSNDSGRuleset.py b/orangecontrib/storynavigation/widgets/OWSNDSGRuleset.py deleted file mode 100644 index 2250bd6..0000000 --- a/orangecontrib/storynavigation/widgets/OWSNDSGRuleset.py +++ /dev/null @@ -1,387 +0,0 @@ -from AnyQt.QtCore import QThread, Qt -from AnyQt.QtWidgets import QWidget, QGridLayout -from Orange.widgets.widget import OWWidget, Input, Output, Msg -from orangecontrib.text import Corpus -from Orange.widgets.utils.widgetpreview import WidgetPreview -from Orange.data import ContinuousVariable, Table, Domain -from Orange.widgets import gui, widget -from Orange.widgets.settings import Setting -from Orange.data.pandas_compat import table_from_frame -from typing import Optional -import os -import pandas -import re -import json -import spacy -from spacy.matcher import DependencyMatcher -import sys - -class OWSNDSGRuleset(widget.OWWidget): - name = 'DSG ruleset' - description = 'Digital Story Grammar: Rules for how to decompose sentences into narrative components' - icon = 'icons/dsg_ruleset_icon.png' - priority = 6425 - - resizing_enabled = False - DEBUG = False - NL_SPACY_PIPELINE = "nl_core_news_sm" - NL_DEPENDENCY_PATTERN_FILE = "orangecontrib/storynavigation/widgets/rules/multilingual_dsg_patterns_nl.json" - - class Inputs: - data = Input("Table", Table) - - class Outputs: - table = Output("Table", Table) - - auto_commit = Setting(False) - - 
def __init__(self): - super().__init__() - - self.data = None - self.table = None - - def check_dict_in_list(self, dict_obj, dict_list): - """Check if a dictionary (partially) matches a list of dictionaries. - - Note: This function is used to avoid duplicate matches (e.g., Subj+Verb in Subj+Verb+Obj) - - Args: - dict_obj (dict): A dictionary object. - dict_list (list): A list of dictionary objects. - - Returns: - bool: True if all non-empty items in dict_obj match the items in any dictionary objects in dict_list, otherwise False. - """ - if dict_obj in dict_list: - return True - - check = [False] * len(dict_obj.keys()) - - for i, key in enumerate(dict_obj.keys()): - if str(dict_obj[key]) == "_": - check[i] = True - next - else: - for ref_dict in dict_list: - if dict_obj[key].i == ref_dict[key].i: - check[i] = True - break - - return all(check) - - def load_spacy_pipeline(self, name): - """Check if the spacy language pipeline was downloaded and load it. - Downloads the language pipeline if not available. - - Args: - name (string): Name of the spacy language. - - Returns: - spacy.language.Language: The spacy language pipeline - """ - if spacy.util.is_package(name): - nlp = spacy.load(name) - else: - os.system(f"spacy download {name}") - nlp = spacy.load(name) - return nlp - - - def create_matcher(self, nlp, pattern_file): - """Create a spacy dependency matcher. - - Args: - nlp (spacy.language.Language): A spacy language pipeline. - pattern_file (str): The path to the dependency pattern .json file for the matcher. - - Returns: - spacy.matcher.DependencyMatcher: A spacy dependency matcher object. - """ - matcher = DependencyMatcher(nlp.vocab, validate=True) - - with open(pattern_file, "r") as file: - patterns = json.load(file) - - for i, pattern in enumerate(patterns): - matcher.add(i, [pattern]) - - return matcher - - def extract_matches(self, doc, matches, matcher, nlp, keys): - """Extract the matched tokens for selected keys. - - Args: - doc (spacy.tokens.Doc): A spacy doc object as returned by a spacy language pipeline. - matches (list): A list of (match_id, token_ids) tuples as returned by a spacy dependency matcher. - matcher (spacy.matcher.DependencyMatcher): A spacy dependency matcher object. - nlp (spacy.language.Language): A spacy language pipeline. - keys (list): A list of keys to which the dependcy matches are assigned. - - Returns: - list: A list of dictionaries that each contain a match of the dependency matcher. - Has the same keys as the `keys` argument. Empty keys contain a spacy token with text='_'. - """ - matches_list = [] - - for l, (match_id, token_ids) in enumerate(matches): - match_dict = {} - - for key in keys: - match_dict[key] = nlp("_")[0] - - for k, token_id in enumerate(token_ids): - key = matcher.get(match_id)[1][0][k]["RIGHT_ID"] - if key in match_dict.keys(): - match_dict[key] = doc[token_id] - - if not self.check_dict_in_list(match_dict, matches_list): - match_dict["match_id"] = match_id - matches_list.append(match_dict) - - return matches_list - - def get_subject_object_verb_table(self, docs, nlp, matcher, keys=["verb", "subj", "obj", "comp", "prep", "aux", "subjadj", "objadj", "obl", "case", "case_arg", "objfixed", ]): - """Construct a pandas dataframe with subjects, verbs, and objects per sentence of documents. - - Args: - docs (list): A list of text strings. - nlp (spacy.language.Language): A spacy language pipeline. - matcher (spacy.matcher.DependencyMatcher): A spacy dependency matcher object. 
- keys (list): A list of keys to which the dependency matches are assigned. - Defaults to subjects, verbs, and objects. - - Returns: - pandas.DataFrame: A dataframe with a row for each match of the dependency matcher and cols: - doc_id (str): Index of the document in the document list. - sent_id (str): Index of the sentence in the document. - sent (spacy.tokens.Span): A spacy span object with the sentence. - match_id (str): Index of the match in the sentence. - - For each key in the `keys` argument: - key (spacy.tokens.Token): A spacy token object that matches the dependency matcher patterns. - """ - docs_piped = nlp.pipe(docs) - - table_dict = { - "doc_id": [], - "sent_id": [], - "sent": [], - "match_id": [], - "subj": [], - "verb": [], - "obj": [], - "comp": [], - "prep": [], - "aux": [], - "subjadj": [], - "objadj": [], - "obl": [], - "case": [], - "case_arg": [], - "objfixed": [], - } - for i, doc in enumerate(docs_piped): # i: doc index - if self.DEBUG: - for token in doc: - print(token, token.pos_, token.dep_, token.head) - for j, sent in enumerate(doc.sents): # j: sent index - matches = matcher(sent) - matches_list = self.extract_matches( - sent, matches, matcher, nlp, keys=keys) - for l, match in enumerate(matches_list): # l: match index - table_dict["doc_id"].append(str(i)) - table_dict["sent_id"].append(str(j)) - table_dict["sent"].append(sent.text) - table_dict["match_id"].append(str(match["match_id"])) - - for key in keys: - table_dict[key].append(self.append_children_deps(match[key], doc, ["compound", "flat"])) - - # Check for conjuncts, and add table row for each - for conj in match[key].conjuncts: - table_dict["doc_id"].append(str(i)) - table_dict["sent_id"].append(str(j)) - table_dict["sent"].append(sent.text) - table_dict["match_id"].append(str("?")) - table_dict[key].append(conj) - for key_conj in keys: - if key != key_conj: - table_dict[key_conj].append(match[key_conj]) - if self.DEBUG: - print("") - - for i in range(0, len(table_dict["comp"])): - # insert table_dict["comp"][i] in table_dict["verb"][i]) here - pass - - return pandas.DataFrame(table_dict) - - - def get_children_ids(self, token, children_deps, ids): - for child in token.children: - if child.dep_ in children_deps: - ids.append(child.i) - ids = self.get_children_ids(child, children_deps, ids) - return ids - - - def append_children_deps(self, token, doc, children_deps): - """Append children to a token based on dependency tag. - - Note: This function is used to append words of a noun compound. - - Args: - token (spacy.token.Token): A spacy token object. - doc (spacy.token.Doc): A spacy doc object that includes the token. - children_deps (list): A list of dependency tags. - - Returns: - spacy.token.Token: A span of spacy tokens (token argument plus children with specified dependency tags) - if token argument is non-empty, the token argument otherwise. 
- """ - - if str(token) != "_": - children_match_idx = self.get_children_ids(token, children_deps, [token.i]) - span = doc[min(children_match_idx):max(children_match_idx)+1] - - return span - else: - return "" - - - def combine_rows(self, result_table): - near_duplicate_successive_rows = {} - for i, row in result_table.iterrows(): - if i > 0: - different_columns = [] - for column_name in result_table.loc[i].keys(): - if str(result_table.loc[i][column_name]) != str(result_table.loc[i-1][column_name]): - different_columns.append(column_name) - if len(different_columns) == 1: - near_duplicate_successive_rows[i-1] = different_columns[0] - for i, row in result_table.iterrows(): - if i in near_duplicate_successive_rows: - row[near_duplicate_successive_rows[i]] = str(row[near_duplicate_successive_rows[i]]) + " " + str(result_table.loc[i+1][near_duplicate_successive_rows[i]]) - for i, row in result_table.iterrows(): - if i-1 in near_duplicate_successive_rows: - result_table = result_table.drop(i) - return result_table.reset_index(drop=True) - - - def add_verb_group_column(self, result_table): - subjadj_column = [] - objadj_column = [] - rows_to_be_deleted = [] - for i, row in result_table.iterrows(): - subjadj_column.append(str(row["subjadj"])) - objadj_column.append(str(row["objadj"])) - if i > 0 and str(row["doc_id"]) == str(result_table.loc[i-1]["doc_id"]) and str(row["subjadj"]) != "": - subjadj_column[-2] += " " + str(row["subjadj"]) - rows_to_be_deleted.append(i) - if i > 0 and str(row["doc_id"]) == str(result_table.loc[i-1]["doc_id"]) and str(row["objadj"]) != "": - objadj_column[-2] += " " + str(row["objadj"]) - rows_to_be_deleted.append(i) - - result_table["subjadj"] = subjadj_column - result_table["objadj"] = objadj_column - rows_to_be_deleted = list(set(rows_to_be_deleted)) - rows_to_be_deleted.sort() - for row_id in rows_to_be_deleted: - result_table = result_table.drop(row_id) - verb_group_column = [] - subj_extended_column = [] - obj_extended_column = [] - means_column = [] - for i, row in result_table.iterrows(): - verb_group = str(row["verb"]) - subj_extended = str(row["subj"]) - obj_extended = str(row["obj"]) - means = "" - if str(row["aux"]) != "": - verb_group = str(row["aux"]) + " " + verb_group - if str(row["prep"]) != "": - verb_group += " " + str(row["prep"]) - if str(row["comp"]) != "": - verb_group += " " + str(row["comp"]) - if str(row["subjadj"]) != "": - subj_extended = str(row["subjadj"]) + " " + subj_extended - if str(row["objadj"]) != "": - obj_extended = str(row["objadj"]) + " " + obj_extended - if str(row["objfixed"]) != "": - obj_extended = obj_extended + " " + str(row["objfixed"]) - if str(row["obl"]) != "" and str(row["case"]) != "": - means = str(row["case"]) + " " + str(row["case_arg"]) + " " + str(row["obl"]) - verb_group_column.append(verb_group) - subj_extended_column.append(subj_extended) - obj_extended_column.append(obj_extended) - means_column.append(means) - result_table["verb group"] = verb_group_column - result_table["subj_extended"] = subj_extended_column - result_table["obj_extended"] = obj_extended_column - result_table["means"] = means_column - result_table = result_table[["doc_id", "sent_id", "sent", "match_id", "subj_extended", "verb group", "obj_extended", "means"]] - return result_table - - - def remove_underscores(self, result_table): - for i, row in result_table.iterrows(): - for column_name in result_table.loc[i].keys(): - if str(result_table.loc[i][column_name]) == "_": - result_table.loc[i][column_name] = "" - return result_table - - 
@Inputs.data - def process_data(self, data: Optional[Table]): - nlp_nl = self.load_spacy_pipeline(self.NL_SPACY_PIPELINE) - matcher_nl = self.create_matcher(nlp_nl, self.NL_DEPENDENCY_PATTERN_FILE) - - if data is not None: - print() - print("hello:") - print() - print(data[0]) - print() - print() - sentences = [ re.sub("\n", " ", str(data[i]["content"])) for i in range(0, len(data)) ] - # sentences = [ re.sub("\n", " ", str(data[i][data[i].domain.index("Text")])) for i in range(0, len(data)) ] - result_table = self.get_subject_object_verb_table(sentences, nlp_nl, matcher_nl) - result_table = self.remove_underscores(self.combine_rows(result_table)) - result_table_combined = self.add_verb_group_column(result_table) - - # a predefined domain is necessary to get consistently formatted output - self.Outputs.table.send(table_from_frame(result_table_combined)) - -def main(): - WidgetPreview(OWSNDSGRuleset).run() - - -if __name__ == "__main__": - main() - -# test without GUI and loading Orange -# ------------------------------------ -# def main(argv=sys.argv): -# from AnyQt.QtWidgets import QApplication -# app = QApplication(list(argv)) -# args = app.arguments() -# if len(args) > 1: -# filename = args[1] -# else: -# filename = "iris" - -# ow = OWSNDSGRuleset() -# ow.show() -# ow.raise_() - -# # dataset = Table(filename) -# # ow.set_data(dataset) -# # ow.handleNewSignals() -# app.exec_() -# # ow.set_data(None) -# # ow.handleNewSignals() -# return 0 - - -# if __name__ == "__main__": -# sys.exit(main()) \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/OWSNDSGSRL.py b/orangecontrib/storynavigation/widgets/OWSNDSGSRL.py deleted file mode 100644 index 02254d8..0000000 --- a/orangecontrib/storynavigation/widgets/OWSNDSGSRL.py +++ /dev/null @@ -1,304 +0,0 @@ -from Orange.data import Table -from Orange.widgets import gui -from Orange.widgets.settings import Setting -from Orange.widgets.widget import OWWidget, Input, Output, Msg -from orangecontrib.text import Corpus -from AnyQt.QtWidgets import QWidget -from Orange.widgets.utils.widgetpreview import WidgetPreview -import os -from typing import Optional, Set, List, Tuple, Dict, Any -from Orange.widgets.settings import DomainContextHandler, ContextSetting, Setting -from Orange.data.pandas_compat import table_from_frame -import pandas -import stanza -import stroll.stanza -import re -import sys - -class OWSNDSGSRL(OWWidget): - name = "Stanza NL SRL" - description = "Natural language processing for Dutch with Stanza with semantic role labelling as final step, uses Stroll: https://github.com/Filter-Bubble/stroll" - # category=None - icon = "icons/dsg_stanzasrl_icon.png" - priority = 6488 - - run_nlp = None - - SRL_FIELDS = [ "sent_id", "head_id", "head", "nsubj", "rel", "Arg0", "Arg1", "Arg2", - "ArgM-ADV", "ArgM-CAU", "ArgM-DIS", "ArgM-LOC", "ArgM-MNR", "ArgM-MOD", "ArgM-NEG", "ArgM-REC", "ArgM-TMP", ] - - class Inputs: - corpus = Input("Corpus", Corpus, default=True) - - class Outputs: - table = Output("Table", Table) - - # auto_commit = Setting(False) - - def swap_aux_head(self, sentence_df, child, head, heads_head): - for i in range(0, len(sentence_df)): - if sentence_df.at[i, "id"] == head: - sentence_df.at[i, "head"] = child - elif sentence_df.at[i, "id"] == child: - sentence_df.at[i, "head"] = heads_head - elif sentence_df.at[i, "head"] == head: - sentence_df.at[i, "head"] = child - return sentence_df - - def correct_attachments_sentence(self, sentence_df): - children = {} - xpos = {} - upos = {} - text = {} - heads = {} - 
for i, row in sentence_df.iterrows(): - child = row["id"] - head = row["head"] - if head not in children: - children[head] = [] - children[head].append(child) - xpos[child] = row["xpos"] - upos[child] = row["upos"] - text[child] = row["text"] - heads[child] = head - for head in children: - if head != 0 and not re.search("^WW", xpos[head]): - for child in children[head]: - if re.search("^WW", xpos[child]) and upos[child] == "AUX": - sentence_df = self.swap_aux_head(sentence_df, child, head, heads[head]) - return sentence_df - - def correct_attachments_table(self, nlp_table_df): - sentence_df = pandas.DataFrame([]) - nlp_table_df_out = pandas.DataFrame([]) - last_id = -1 - for i, row in nlp_table_df.iterrows(): - if row["id"] < last_id: - new_sentence_df = self.correct_attachments_sentence(sentence_df) - if len(nlp_table_df_out) == 0: - nlp_table_df_out = new_sentence_df - else: - nlp_table_df_out = pandas.concat([nlp_table_df_out, new_sentence_df]) - sentence_df = pandas.DataFrame([]) - sentence_df = pandas.concat([sentence_df, pandas.DataFrame([row])], ignore_index=True) - # sentence_df = sentence_df.append(pandas.DataFrame([row]), ignore_index = True) - last_id = row["id"] - if len(sentence_df) > 0: - new_sentence_df = self.correct_attachments_sentence(sentence_df) - if len(nlp_table_df_out) == 0: - nlp_table_df_out = new_sentence_df - else: - nlp_table_df_out = pandas.concat([nlp_table_df_out, new_sentence_df]) - return nlp_table_df_out - - def nlp_analysis_to_table(self, nlp_analysis): - nbr_of_words = 0 - for s in nlp_analysis.sentences: - for w in s.words: - print() - print("word: ", w) - if nbr_of_words == 0: - nlp_table_df = pandas.DataFrame({"id": [w.id], - "text": [w.text], - "lemma": [w.lemma], - "upos": [w.upos], - "xpos": [w.xpos], - "feats": [w.feats], - "head": [w.head], - "deprel": [w.deprel], - "deps": [w.deps], - "misc": [w.misc], - "start_char": [w.start_char], - "end_char": [w.end_char], - "parent": [w.parent], - "sent": [w.sent], - "srl": [w.srl], - "frame": [w.frame], - }) - else: - nlp_table_df.loc[len(nlp_table_df.index)] = [ w.id, w.text, w.lemma, w.upos, w.xpos, w.feats, - w.head, w.deprel, w.deps, w.misc, w.start_char, w.end_char, - w.parent, w.sent, w.srl, w.frame, ] - nbr_of_words += 1 - return nlp_table_df - - def analyze_letter(self, run_nlp, letter_id): - text = self.read_file(letter_id) - print() - print("running nlp...", letter_id) - nlp_analysis = run_nlp(text) - print("finished running nlp.") - print() - print() - print("running nlp analysis...", letter_id) - nlp_table_df = self.nlp_analysis_to_table(nlp_analysis) - print("finished running nlp analysis.") - print() - print("running correct attachments table...", letter_id) - nlp_table_df = self.correct_attachments_table(nlp_table_df) - print("finished running correct attachments table.") - print() - print("running srl analysis...", letter_id) - srl_table_df = self.nlp_table_to_srl_table(nlp_table_df) - print("finished running srl analysis.") - print() - return text, nlp_table_df, srl_table_df - - def nlp_table_to_srl_table(self, nlp_table_df): - srl_table_df = pandas.DataFrame({ field: [] for field in self.SRL_FIELDS }) - srl_data = {} - nlp_data = {} - sentence = {} - last_id = 0 - sent_id = 1 - for i, row in nlp_table_df.iterrows(): - if row['id'] <= last_id: - if len(srl_data) > 0: - self.add_srl_data_to_srl_table(srl_table_df, srl_data, nlp_data, sentence) - sent_id += 1 - srl_data = {} - nlp_data = {} - sentence = {} - if row['srl'] != "_": - if row['head'] not in srl_data: - srl_data[row['head']] = 
{ "sent_id": sent_id, "head_id": row["head"] } - if row['srl'] in srl_data[row['head']]: - print(f"duplicate role for {row['srl']} [{i}]: {srl_data[row['head']][row['srl']]} and {row['lemma']}") - srl_data[row['head']][row['srl']] += " " + row['lemma'] - else: - srl_data[row['head']][row['srl']] = row['lemma'] - if row['frame'] == "rel": - if row['id'] not in srl_data: - srl_data[row['id']] = { "sent_id": sent_id, "head_id": row["id"] } - if row['frame'] not in srl_data[row['id']]: - srl_data[row['id']][row['frame']] = row['lemma'] - else: - srl_data[row['id']][row['frame']] += " " + row['lemma'] - if row['deprel'] == "nsubj": - if row['head'] not in nlp_data: - nlp_data[row['head']] = { "sent_id": sent_id, "head_id": row["head"] } - if 'nsubj' in nlp_data[row['head']]: - nlp_data[row['head']]["nsubj"] += " " + row['lemma'] - else: - nlp_data[row['head']]["nsubj"] = row['lemma'] - if row['deprel'] == "compound:prt": - if row['head'] not in nlp_data: - nlp_data[row['head']] = { "sent_id": sent_id, "head_id": row["head"] } - if 'head' in nlp_data[row['head']]: - nlp_data[row['head']]["head"] += " " + row['lemma'] - else: - nlp_data[row['head']]["head"] = row['lemma'] - last_id = row['id'] - sentence[row['id']] = row['lemma'] - if len(srl_data) > 0: - self.add_srl_data_to_srl_table(srl_table_df, srl_data, nlp_data, sentence) - return srl_table_df - - def add_srl_data_to_srl_table(self, srl_table_df, srl_data, nlp_data, sentence): - print(srl_data) - for phrase_key in srl_data: - if 'head' in srl_data[phrase_key]: - srl_data[phrase_key]["head"] += " " + sentence[phrase_key] - elif phrase_key > 0: - srl_data[phrase_key]["head"] = sentence[phrase_key] - else: - srl_data[phrase_key]["head"] = "FILLER" - if phrase_key in nlp_data: - srl_table_df.loc[len(srl_table_df)] = self.srl_dict_to_srl_list(srl_data[phrase_key], nlp_data[phrase_key]) - else: - srl_table_df.loc[len(srl_table_df)] = self.srl_dict_to_srl_list(srl_data[phrase_key], {}) - - def srl_dict_to_srl_list(self, srl_dict, nlp_dict): - srl_list = len(self.SRL_FIELDS) * [ "" ] - for i in range(0, len(self.SRL_FIELDS)): - if self.SRL_FIELDS[i] in srl_dict: - srl_list[i] = srl_dict[self.SRL_FIELDS[i]] - if self.SRL_FIELDS[i] in nlp_dict: - srl_list[i] = nlp_dict[self.SRL_FIELDS[i]] - return srl_list - - def read_file(self, in_file_id): - return self.corpus.documents[in_file_id] - - def __init__(self): - super().__init__() - # self.corpus = None - # self.table = None - - @Inputs.corpus - def set_corpus(self, corpus: Optional[Corpus]): - run_nlp = stanza.Pipeline(lang='nl', processors='tokenize,lemma,pos,depparse,srl') - all_nlp_data = pandas.DataFrame([]) - all_srl_data = pandas.DataFrame([]) - - # reset gui - # for i in reversed(range(self.controlArea.layout().count())): - # self.controlArea.layout().itemAt(i).widget().setParent(None) - - self.corpus = corpus - - if hasattr(self, "corpus"): - if self.corpus is None: - print("it is none") - else: - print("it is not none") - if (self.corpus is not corpus): - print("it is different") - else: - print("it is the same") - - if self.corpus is not None: - print("got here") - for letter_id in range(0, len(self.corpus.documents)): - text, nlp_table_df, srl_table_df = self.analyze_letter(run_nlp, letter_id) - all_srl_data = pandas.concat([all_srl_data, srl_table_df]) - all_nlp_data = pandas.concat([all_nlp_data, nlp_table_df]) - - self.Outputs.table.send(table_from_frame(all_srl_data)) - - print() - print() - print("NLP analysis:") - print() - print(all_nlp_data) - print() - print() - print("SRL 
analysis") - print() - print(all_srl_data) - - - -# def main(): -# WidgetPreview(OWSNDSGSRL).run() - - -# if __name__ == "__main__": -# main() - -# test without GUI and loading Orange -# ------------------------------------ -def main(argv=sys.argv): - from AnyQt.QtWidgets import QApplication - app = QApplication(list(argv)) - args = app.arguments() - if len(args) > 1: - filename = args[1] - else: - filename = "iris" - - ow = OWSNDSGSRL() - ow.show() - ow.raise_() - - # dataset = Table(filename) - # ow.set_data(dataset) - # ow.handleNewSignals() - app.exec_() - # ow.set_data(None) - # ow.handleNewSignals() - return 0 - - -if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/OWSNDSGTagger.py b/orangecontrib/storynavigation/widgets/OWSNDSGTagger.py index fd19dd8..57bd545 100644 --- a/orangecontrib/storynavigation/widgets/OWSNDSGTagger.py +++ b/orangecontrib/storynavigation/widgets/OWSNDSGTagger.py @@ -37,12 +37,14 @@ from orangewidget.utils.listview import ListViewSearch from orangecontrib.text.corpus import Corpus + import spacy from spacy import displacy import nltk nltk.download('perluniprops') from nltk.tokenize import sent_tokenize, word_tokenize import dhtmlparser3 +# import neuralcoref HTML = """ @@ -117,11 +119,11 @@ """ + SEPARATOR = ( '' ) - def _count_matches(content: List[str], search_string: str, state: TaskState) -> int: """ Count number of appears of any terms in search_string in content texts. @@ -291,11 +293,10 @@ def set_domain(self, domain): ) super().set_domain(domain) - class OWSNDSGTagger(OWWidget, ConcurrentWidgetMixin): - name = "Word Highlighter" - description = "Identifies named entities and part-of-speech tokens (nouns, adjectives, verbs etc.) in text" - icon = "icons/dsgtagger.svg" + name = "1) Actor Analysis" + description = "Provides tools to support basic narrative analysis for actors in stories." + icon = "icons/dsgtagger.png" priority = 500 NL_SPACY_MODEL = "nl_core_news_lg" @@ -310,24 +311,29 @@ class Outputs: settingsHandler = DomainContextHandler() settings_version = 2 + search_features: List[Variable] = ContextSetting([]) + display_features: List[Variable] = ContextSetting([]) + selected_documents: Set[int] = Setting({0}, schema_only=True) + regexp_filter = ContextSetting("") + show_tokens = Setting(False) + autocommit = Setting(True) + # Scoring related to agent prominence score agent_prominence_score_max = 0. agent_prominence_score_min = 0. - agent_prominence_metrics = ['Unique nouns', 'Unique words', 'Sentence subjects', 'Sentence subjects norm'] - agent_prominence_metric = 'Unique nouns' + agent_prominence_metrics = ['Raw frequency', 'Subject frequency', 'Subject frequency (normalized)', 'Subject agency', 'Subject agency prominence'] + agent_prominence_metric = 'Subject frequency' + # Index of word prominence scores for each word in story word_prominence_scores = {} + + # HTML string rendering of story document html_result = '' + # POS or NER? 
radiobutton selection of entity type to highlight tag_type = Setting(1) - search_features: List[Variable] = ContextSetting([]) - display_features: List[Variable] = ContextSetting([]) - selected_documents: Set[int] = Setting({0}, schema_only=True) - regexp_filter = ContextSetting("") - show_tokens = Setting(False) - autocommit = Setting(True) - - # POS + + # Parts of speech (POS) checkbox selected initialization vbz = Setting(True) nouns = Setting(True) adj = Setting(True) @@ -342,7 +348,7 @@ class Outputs: all_pos = Setting(True) zero_pos = Setting(False) - # NER + # Named entity recognition (NER) types checkbox selected initialization per = Setting(True) loc = Setting(True) gpe = Setting(True) @@ -362,24 +368,42 @@ class Outputs: ordinal = Setting(True) cardinal = Setting(True) - # panels for pos and ner tag selection + # Panels for POS and NER tag types or lists postags_box = None nertags_box = None main_agents_box = None + + # list of Dutch stopwords nl_stopwords = [] - # pos counts + # POS counts initialisation noun_count = 0 verb_count = 0 adjective_count = 0 + + # Other counts initialisation word_count = 0 word_count_nostops = 0 sentence_count = 0 sentence_count_per_word = {} count_per_word = {} + count_per_word_passive = {} + count_per_word_active = {} + noun_action_dict = {} + + # original text (not tagged) + original_text = '' + sli = None + + # list of colour values for the background highlight for each entity type highlight_colors = {} + + # list of punctuation characters punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~0123456789''' + + # list of POS checkboxes for each POS type + pos_checkboxes = [] class Warning(OWWidget.Warning): no_feats_search = Msg("No features included in search.") @@ -389,35 +413,26 @@ def __init__(self): super().__init__() ConcurrentWidgetMixin.__init__(self) - with open('orangecontrib/storynavigation/widgets/utils/dutchstopwords.txt', 'r', encoding='utf8') as f: + # loads list of Dutch stopwords + with open('orangecontrib/storynavigation/utils/dutchstopwords.txt', 'r', encoding='utf8') as f: self.nl_stopwords = [line.rstrip() for line in f] - - # print() - # print() - # print(self.nl_stopwords) - # print() - # print() - self.corpus = None # Corpus - self.nlp_nl = None + self.corpus = None # initialise list of documents (corpus) + self.nlp_nl = None # initialise spacy model self.__pending_selected_documents = self.selected_documents # Search features ex_sel = QListView.ExtendedSelection - # search_box = gui.widgetBox(self.controlArea, "Search features") self.search_listbox = sl = VariableListViewSearch(selectionMode=ex_sel) - # search_box.layout().addWidget(sl) sl.setModel(VisibleDomainModel(separators=False)) sl.selectionModel().selectionChanged.connect(self.search_features_changed) # Display features - # display_box = gui.widgetBox(self.controlArea, "Display features") self.display_listbox = dl = VariableListViewSearch(selectionMode=ex_sel) - # display_box.layout().addWidget(dl) dl.setModel(VisibleDomainModel(separators=False)) dl.selectionModel().selectionChanged.connect(self.display_features_changed) - # Tag type selection + # Tag type selection panel tag_type_panel = gui.widgetBox(self.controlArea, "Category of words to highlight:", orientation=Qt.Horizontal,sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)) self.tagtype_box = box = gui.radioButtonsInBox(self.controlArea, self, "tag_type", [], callback=self._tagtype_changed) self.named_entities = gui.appendRadioButton(box, "Named Entities") @@ -426,42 +441,46 @@ def __init__(self): # 
POS tag list self.postags_box = gui.vBox(self.controlArea, "Parts of Speech to highlight:", sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)) - gui.checkBox(self.postags_box, self, "vbz", "Verbs",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "nouns", "Nouns",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "propn", "Proper nouns",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "adj", "Adjectives",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "adp", "Prepositions / Postpositions",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "adv", "Adverbs",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "conj", "Conjunctives",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "det", "Determinative",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "num", "Numericals",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "prt", "Particles",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "pron", "Personal pronouns",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "all_pos", "All",callback=self.pos_selection_changed) - gui.checkBox(self.postags_box, self, "zero_pos", "None",callback=self.pos_selection_changed) - + self.vc = gui.checkBox(self.postags_box, self, "vbz", "Actions",callback=self.pos_selection_changed) + self.nc = gui.checkBox(self.postags_box, self, "nouns", "Entities",callback=self.pos_selection_changed) + # self.propnc = gui.checkBox(self.postags_box, self, "propn", "Proper nouns",callback=self.pos_selection_changed) + self.adjc = gui.checkBox(self.postags_box, self, "adj", "Descriptives",callback=self.pos_selection_changed) + # self.adpc = gui.checkBox(self.postags_box, self, "adp", "Prepositions / Postpositions",callback=self.pos_selection_changed) + # self.advc = gui.checkBox(self.postags_box, self, "adv", "Adverbs",callback=self.pos_selection_changed) + # self.conjc = gui.checkBox(self.postags_box, self, "conj", "Conjunctives",callback=self.pos_selection_changed) + # self.detc = gui.checkBox(self.postags_box, self, "det", "Determinative",callback=self.pos_selection_changed) + # self.numc = gui.checkBox(self.postags_box, self, "num", "Numericals",callback=self.pos_selection_changed) + # self.prtc = gui.checkBox(self.postags_box, self, "prt", "Particles",callback=self.pos_selection_changed) + # self.pronc = gui.checkBox(self.postags_box, self, "pron", "Personal pronouns",callback=self.pos_selection_changed) + self.allc = gui.checkBox(self.postags_box, self, "all_pos", "All") + self.allc.setChecked(False) + self.allc.stateChanged.connect(self.on_state_changed_pos) + + # self.pos_checkboxes = [self.vc, self.nc, self.propnc, self.adjc, self.adpc, self.advc, self.conjc, self.detc, self.numc, self.prtc, self.pronc] + self.pos_checkboxes = [self.vc, self.nc, self.adjc] + # gui.checkBox(self.postags_box, self, "zero_pos", "None",callback=self.pos_selection_changed) self.controlArea.layout().addWidget(self.postags_box) # NER tag list self.nertags_box = gui.vBox(self.controlArea, "Named entities to highlight:", sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)) gui.checkBox(self.nertags_box, self, "per", "People",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "gpe", "Countries, cities, regions",callback=self.ner_selection_changed) - 
gui.checkBox(self.nertags_box, self, "loc", "Other kinds of locations",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "norp", "Nationalities and religious or political groups",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "fac", "Buildings, airports, highways, bridges etc.",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "org", "Companies, agencies, institutions, etc.",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "product", "Objects, vehicles, foods, etc. (Not services)",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "eventner", "Named hurricanes, battles, wars, sports events, etc.",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "work_of_art", "Titles of books, songs, etc.",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "law", "Named documents made into laws",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "language", "Any named language",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "date", "Absolute or relative dates or periods",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "time", "Times smaller than a day",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "percent", "Percentages",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "money", "Monetary values",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "quantity", "Measurements, as of weight or distance",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "ordinal", "'first', 'second', etc.",callback=self.ner_selection_changed) - gui.checkBox(self.nertags_box, self, "cardinal", "Numerals that do not fall under another category",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "gpe", "Countries, cities, regions",callback=self.ner_selection_changed) + gui.checkBox(self.nertags_box, self, "loc", "Places",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "norp", "Nationalities and religious or political groups",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "fac", "Buildings, airports, highways, bridges etc.",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "org", "Companies, agencies, institutions, etc.",callback=self.ner_selection_changed) + gui.checkBox(self.nertags_box, self, "product", "Other entities",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "eventner", "Named hurricanes, battles, wars, sports events, etc.",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "work_of_art", "Titles of books, songs, etc.",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "law", "Named documents made into laws",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "language", "Any named language",callback=self.ner_selection_changed) + gui.checkBox(self.nertags_box, self, "date", "Temporals",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "time", "Times smaller than a day",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "percent", "Percentages",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "money", "Monetary 
values",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "quantity", "Measurements, as of weight or distance",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "ordinal", "'first', 'second', etc.",callback=self.ner_selection_changed) + # gui.checkBox(self.nertags_box, self, "cardinal", "Numerals that do not fall under another category",callback=self.ner_selection_changed) self.controlArea.layout().addWidget(self.nertags_box) self.nertags_box.setEnabled(False) @@ -520,8 +539,13 @@ def __init__(self): self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed) # Document contents self.doc_webview = gui.WebviewWidget(self.splitter, debug=False) + # self.doc_webview.setStyleSheet("QWidget {background-color: #0ff}") self.mainArea.layout().addWidget(self.splitter) + def on_state_changed_pos(self, state): + for checkBox in self.pos_checkboxes: + checkBox.setCheckState(state) + def copy_to_clipboard(self): text = self.doc_webview.selectedText() QApplication.clipboard().setText(text) @@ -541,9 +565,16 @@ def load_spacy_pipeline(self, name): else: os.system(f"spacy download {name}") nlp = spacy.load(name) + nlp.add_pipe("merge_noun_chunks") + nlp.add_pipe("merge_entities") return nlp def _tagtype_changed(self): + """ Toggles the disabling and enabling of the list of checkboxes associated with + Parts of Speech vs. Named Entities. The user cannot highlight entities of both + categories. When POS tags are selected, the list of NER checkboxes are disabled + and vice versa. This function takes care of this. + """ if self.tag_type == 1: self.postags_box.setEnabled(True) self.main_agents_box.setEnabled(True) @@ -571,6 +602,7 @@ def rehighlight_entities(self): @Inputs.corpus def set_data(self, corpus=None): self.nlp_nl = self.load_spacy_pipeline(self.NL_SPACY_MODEL) + self.closeContext() self.reset_widget() self.corpus = corpus @@ -679,6 +711,9 @@ def selection_changed(self) -> None: self.commit.deferred() def filter_entities(self): + # print() + # print("got here!!!!!!!!!!!!") + # print() if ((len(self.word_prominence_scores) == 0) or (self.html_result == '')): self.show_docs(slider_engaged=False) self.commit.deferred() @@ -716,37 +751,34 @@ def filter_entities(self): def calculate_prominence_score(self, word, list_of_sentences, tags): score = 0 - if (self.agent_prominence_metric == "Sentence subjects"): + if (self.agent_prominence_metric == "Subject frequency"): as_subj_count = 0 for tag in tags: - if tag[3] == 'nsubj': + if tag[3] == 'nsubj' or tag[3] == 'nsubj:pass': as_subj_count += 1 score = (as_subj_count / len(list_of_sentences)) * 100 - # print("subjects score: ", score) - elif (self.agent_prominence_metric == "Sentence subjects norm"): + elif (self.agent_prominence_metric == "Subject frequency (normalized)"): as_subj_count = 0 for tag in tags: - if tag[3] == 'nsubj': + if tag[3] == 'nsubj' or tag[3] == 'nsubj:pass': as_subj_count += 1 score = (as_subj_count / self.sentence_count_per_word[word]) * 100 - # print("norm score: ", score) - - elif (self.agent_prominence_metric == "Unique nouns"): - score = (self.count_per_word[word] / self.noun_count) * 100 - # print("unique nouns score: ", score) - elif (self.agent_prominence_metric == "Unique words"): - score = (self.count_per_word[word] / self.word_count_nostops) * 100 - # print("unique words score: ", score) + + elif (self.agent_prominence_metric == "Raw frequency"): + score = self.count_per_word[word] + elif (self.agent_prominence_metric == "Subject agency"): + if 
(self.count_per_word_active[word] + self.count_per_word_passive[word]) > 0: + score = (self.count_per_word_active[word] - self.count_per_word_passive[word])/(self.count_per_word_active[word] + self.count_per_word_passive[word]) + elif (self.agent_prominence_metric == "Subject agency prominence"): + if (self.count_per_word_active[word] + self.count_per_word_passive[word]) > 0: + score = self.count_per_word[word] * ((self.count_per_word_active[word] - self.count_per_word_passive[word])/(self.count_per_word_active[word] + self.count_per_word_passive[word])) elif (self.agent_prominence_metric == "Persons only"): - score = 0 - # print("persons only score: ", score) else: score = 0 - # print("other score: ", score) return score @@ -771,9 +803,6 @@ def slider_callback(self): self.commit.deferred() def show_docs(self, slider_engaged = False): - # print() - # print(self.agent_prominence_metric) - # print() """Show the selected documents in the right area""" if self.corpus is None: return @@ -782,14 +811,20 @@ def show_docs(self, slider_engaged = False): if len(self.display_features) == 0: self.Warning.no_feats_display() - if self.show_tokens: - tokens = list(self.corpus.ngrams_iterator(include_postags=True)) + # if self.show_tokens: + # tokens = list(self.corpus.ngrams_iterator(include_postags=True)) parts = [] for doc_count, c_index in enumerate(sorted(self.selected_documents)): text = "" for feature in self.display_features: value = str(self.corpus[c_index, feature.name]) + self.original_text = str(value) + # print() + # print() + # print(value) + # print() + # print() if feature.name == 'content': if (self.tag_type == 1): @@ -797,16 +832,42 @@ def show_docs(self, slider_engaged = False): value = self.filter_entities() else: value = self.__postag_text(value) + # print() + # print() + # print(value) + # print() + # print() + else: value = self.__nertag_text(value) - if feature in self.search_features: - value = self.__mark_text(value) + # print() + # print("feature: ", feature) + # print() + # print() + # print("self.search_features: ", self.search_features) + # print() + # print() + if feature in self.search_features and (len(self.regexp_filter) > 0): + value = self.__mark_text(self.original_text) + # print() + # print() + # print("marking text???? why????") + # print() + # print(value) + # print() + # print() if feature.name != 'content': value = value.replace("\n", "
") + # print() + # print() + # print("doing replace!") + # print() + # print() is_image = feature.attributes.get("type", "") == "image" + if is_image and value != "?": value = os.path.join(feature.attributes.get("origin", ""), value) value = ''.format(value) @@ -816,17 +877,20 @@ def show_docs(self, slider_engaged = False): f'{value}' ) - if self.show_tokens: - tokens_ = "".join( - f'{token}' for token in tokens[c_index] - ) - text += ( - f'Tokens & Tags:' - f"{tokens_}" - ) + # if self.show_tokens: + # tokens_ = "".join( + # f'{token}' for token in tokens[c_index] + # ) + # text += ( + # f'Tokens & Tags:' + # f"{tokens_}" + # ) + parts.append(text) parts.append(self.get_word_prominence_bar_chart_html()) - + parts.append(self.generate_noun_action_table(self.noun_action_dict)) + # parts.append(text) + joined = SEPARATOR.join(parts) html = f"{joined}
" base = QUrl.fromLocalFile(__file__) @@ -847,15 +911,15 @@ def get_word_prominence_bar_chart_html(self): plt.bar(range(len(res)), values, tick_label=names) plt.xticks(rotation=45) # Option 1 - plt.rcParams['font.size'] = 8 + plt.rcParams['font.size'] = 12 # Option 2 # plt.rcParams.update({'font.size': 18}) tmpfile = BytesIO() plt.savefig(tmpfile, format='png') plt.tight_layout(rect=[0.05, 0.05, 1, 0.75]) encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8') - - html = ''.format(encoded) + html = """
Top 10 entity prominence scores
""".format(encoded) + # html = """
Top 10 entity prominence scores
""".format(encoded) plt.figure().clear() plt.close() @@ -863,6 +927,37 @@ def get_word_prominence_bar_chart_html(self): plt.clf() return html + + def generate_noun_action_table(self, noun_action_dict, ): + import pandas as pd + from operator import itemgetter + + n=10 + res = dict(sorted(self.word_prominence_scores.items(), key = itemgetter(1), reverse = True)[:n]) + names = list(res.keys()) + + rows = [] + for item in noun_action_dict: + if len(noun_action_dict[item]) > 0 and (item in names): + curr_row = [] + curr_row.append(item) + curr_row.append(', '.join(list(set(noun_action_dict[item])))) + rows.append(curr_row) + + df = pd.DataFrame(rows, columns = ['actor', 'actions']) + + # html = """
Top 10 entity prominence scores
""".format(encoded) + # styles = [dict(selector="caption", props=[("background-color", "cyan")])] + + # df.style.set_caption('Top 10 most prominent actors and their associated actions') + test_html = df.to_html(index=False, justify='center') + test_html = test_html.replace('', 'Top 10 most prominent actors and their associated actions') + # print() + # print() + # print(test_html) + # print() + # print() + return test_html def calculate_word_type_count(self, sent_models): n = set() @@ -880,7 +975,8 @@ def calculate_word_type_count(self, sent_models): nopunct_token += ele if ((token.text.lower().strip() not in self.nl_stopwords) and (len(nopunct_token) == len(token.text.strip()))): - self.count_per_word[token.text] += 1 + self.count_per_word[token.text.lower().strip()] += 1 + w.add(token.text) if token.pos_ in ['NOUN', 'PRON', 'PROPN']: n.add(token.text) @@ -888,6 +984,17 @@ def calculate_word_type_count(self, sent_models): a.add(token.text) if token.pos_ in ['VERB']: v.add(token.text) + + if token.dep_ == 'nsubj' or (token.text.lower().strip() == 'ik'): + if token.text.lower().strip() in self.count_per_word_active: + self.count_per_word_active[token.text.lower().strip()] += 1 + else: + self.count_per_word_active[token.text.lower().strip()] = 1 + if token.dep_ in ['nsubj:pass', 'obj', 'iobj', 'obl:agent', 'obl', 'parataxis']: + if token.text.lower().strip() in self.count_per_word_passive: + self.count_per_word_passive[token.text.lower().strip()] += 1 + else: + self.count_per_word_passive[token.text.lower().strip()] = 1 self.noun_count = len(n) self.word_count = len(w) @@ -903,38 +1010,18 @@ def __nertag_text(self, text): ner_tags.append("PERSON") if (self.loc): ner_tags.append("LOC") - if (self.gpe): ner_tags.append("GPE") - if (self.norp): ner_tags.append("NORP") - if (self.fac): ner_tags.append("FAC") - if (self.org): ner_tags.append("ORG") if (self.product): + ner_tags.append("ORG") ner_tags.append("PRODUCT") - if (self.eventner): ner_tags.append("EVENT") - if (self.work_of_art): ner_tags.append("WORK_OF_ART") - if (self.law): - ner_tags.append("LAW") - if (self.language): - ner_tags.append("LANGUAGE") if (self.date): ner_tags.append("DATE") - if (self.time): ner_tags.append("TIME") - if (self.percent): - ner_tags.append("PERCENT") - if (self.money): - ner_tags.append("MONEY") - if (self.quantity): - ner_tags.append("QUANTITY") - if (self.ordinal): - ner_tags.append("ORDINAL") - if (self.cardinal): - ner_tags.append("CARDINAL") options = {"ents" : ner_tags, "colors" : {}} @@ -946,6 +1033,39 @@ def __nertag_text(self, text): html += displacy.render(tagged_sentence, style="ent", options = options) return html + + # Function to recursively traverse ancestors + def find_verb_ancestor(self, token): + # Check if the token is a verb + if token.pos_ == 'VERB': + return token + + # Traverse the token's ancestors recursively + for ancestor in token.ancestors: + # Recursive call to find the verb ancestor + verb_ancestor = self.find_verb_ancestor(ancestor) + if verb_ancestor: + return verb_ancestor + + # If no verb ancestor found, return None + return None + + def merge_punct(self, doc): + spans = [] + for word in doc[:-1]: + if word.is_punct or not word.nbor(1).is_punct: + continue + start = word.i + end = word.i + 1 + while end < len(doc) and doc[end].is_punct: + end += 1 + span = doc[start:end] + spans.append((span, word.tag_, word.lemma_, word.ent_type_)) + with doc.retokenize() as retokenizer: + for span, tag, lemma, ent_type in spans: + attrs = {"tag": tag, "lemma": lemma, "ent_type": 
ent_type} + retokenizer.merge(span, attrs=attrs) + return doc def __postag_text(self, text): # pos tags that the user wants to highlight @@ -956,23 +1076,10 @@ def __postag_text(self, text): pos_tags.append("VERB") if (self.adj): pos_tags.append("ADJ") + pos_tags.append("ADV") if (self.nouns): pos_tags.append("NOUN") - if (self.pron): pos_tags.append("PRON") - if (self.adp): - pos_tags.append("ADP") - if (self.adv): - pos_tags.append("ADV") - if (self.conj): - pos_tags.append("CONJ") - if (self.det): - pos_tags.append("DET") - if (self.num): - pos_tags.append("NUM") - if (self.prt): - pos_tags.append("PRT") - if (self.propn): pos_tags.append("PROPN") # tokenize input text into sentences @@ -985,10 +1092,15 @@ def __postag_text(self, text): from spacy.lang.nl import Dutch nlp = Dutch() tokenizer = nlp.tokenizer + + for i in range(0, len(sents)): + sents[i] = sents[i].replace('.', '') + for sentence in sents: sentence = sentence.replace("\n"," ") sentence = sentence.replace(" "," ") sentence = re.sub('\s+',' ',sentence) + sentence = sentence.replace('.', '') # tokens = word_tokenize(sentence, language='dutch') tokens_doc = tokenizer(sentence) @@ -998,8 +1110,11 @@ def __postag_text(self, text): self.word_count += len(tokens) for token in tokens: - self.sentence_count_per_word[token] = 0 - self.count_per_word[token] = 0 + self.sentence_count_per_word[token.lower().strip()] = 0 + self.count_per_word[token.lower().strip()] = 0 + self.count_per_word_active[token.lower().strip()] = 0 + self.count_per_word_passive[token.lower().strip()] = 0 + self.noun_action_dict[token.lower().strip()] = [] if token.lower().strip() not in self.nl_stopwords: self.word_count_nostops += 1 @@ -1008,6 +1123,7 @@ def __postag_text(self, text): sentence = sentence.replace("\n"," ") sentence = sentence.replace(" "," ") sentence = re.sub('\s+',' ',sentence) + sentence = sentence.replace('.', '') # tokens = word_tokenize(sentence, language='dutch') tokens_doc = tokenizer(sentence) tokens = [] @@ -1015,7 +1131,7 @@ def __postag_text(self, text): tokens.append(d.text) for token in tokens: - self.sentence_count_per_word[token] += 1 + self.sentence_count_per_word[token.lower().strip()] += 1 # output of this function html = "" @@ -1026,18 +1142,23 @@ def __postag_text(self, text): sentence = sentence.replace("\n"," ") sentence = sentence.replace(" "," ") sentence = re.sub('\s+',' ',sentence) + sentence = sentence.replace('.', '') tagged_sentence = self.nlp_nl(sentence) + # tagged_sentence = self.merge_punct(tagged_sentence) sentence_nlp_models.append(tagged_sentence) # calculate the number of unique nouns in the text self.calculate_word_type_count(sentence_nlp_models) - # loop through model to filter out those words that need to be tagged (bsaed on user selection and prominence score) + # loop through model to filter out those words that need to be tagged (based on user selection and prominence score) for sentence, tagged_sentence in zip(sents, sentence_nlp_models): tags = [] for token in tagged_sentence: tags.append((token.text, token.pos_, token.tag_, token.dep_)) + + # for ent in tagged_sentence.ents: + # print(ent, " : ", ent._.coref_cluster) from nltk.tokenize import RegexpTokenizer tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+') @@ -1045,6 +1166,7 @@ def __postag_text(self, text): ents = [] for tag, span in zip(tags, spans): + if tag[0].lower().strip() not in self.nl_stopwords: if tag[1] in pos_tags: if tag[1] == 'PRON': @@ -1052,13 +1174,22 @@ def __postag_text(self, text): tmp_tags = tag[2].split('|') if ((tmp_tags[1] 
== 'pers' and tmp_tags[2] == 'pron') or (tmp_tags[1] == 'pr' and tmp_tags[2] == 'pron') or (tmp_tags[1] == 'bez' and tmp_tags[2] == 'det')): p_score = 0 - p_score = self.calculate_prominence_score(tag[0], sents, tags) - self.word_prominence_scores[tag[0].lower()] = p_score + p_score = self.calculate_prominence_score(tag[0].lower().strip(), sents, tags) + self.word_prominence_scores[tag[0].lower().strip()] = p_score if (p_score >= self.agent_prominence_score_min): + print("word: ", tag[0]) + print("tag: ", tag[1]) ents.append({"start" : span[0], "end" : span[1], "label" : tag[1] }) + + # print("noun: ", token.text) + # print("sentence: ", sentence) + # # print("testing: ", self.find_verb_ancestor(token).text) + vb = self.find_verb_ancestor(token) + if vb is not None: + self.noun_action_dict[tag[0].lower().strip()].append(vb.text) # ents.append({"start" : span[0], # "end" : span[1], @@ -1066,15 +1197,32 @@ def __postag_text(self, text): elif ((tag[1] == 'NOUN') or (tag[1] == 'PROPN')): p_score = 0 - p_score = self.calculate_prominence_score(tag[0], sents, tags) - self.word_prominence_scores[tag[0].lower()] = p_score + p_score = self.calculate_prominence_score(tag[0].lower().strip(), sents, tags) + self.word_prominence_scores[tag[0].lower().strip()] = p_score if (p_score >= self.agent_prominence_score_min): + print("word: ", tag[0]) + print("tag: ", tag[1]) ents.append({"start" : span[0], "end" : span[1], "label" : tag[1] }) + + # print() + # print('t: ', tag[1]) + # print() + + + # print("noun2: ", token.text) + # print("sentence2: ", sentence) + # # print("testing2: ", self.find_verb_ancestor(token).text) + vb = self.find_verb_ancestor(token) + if vb is not None: + self.noun_action_dict[tag[0].lower().strip()].append(vb.text) else: + # print() + # print('t2: ', tag[1]) + # print() ents.append({"start" : span[0], "end" : span[1], "label" : tag[1] }) @@ -1102,8 +1250,12 @@ def __postag_text(self, text): html += displacy.render(doc, style = "ent", options = options, manual = True) # print() + # print("noun action dict:") + # print() + # for item in self.noun_action_dict: + # if len(self.noun_action_dict[item]) > 0: + # print(item, " : ", self.noun_action_dict[item]) # print() - # print(html) # print() # print() diff --git a/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py b/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py new file mode 100644 index 0000000..384807c --- /dev/null +++ b/orangecontrib/storynavigation/widgets/OWSNNarrativeNetwork.py @@ -0,0 +1,302 @@ +import os +import re +import sre_constants +from typing import Any, Iterable, List, Set +import numpy as np +import scipy.sparse as sp + +from Orange.data import Table, Domain, StringVariable +from Orange.widgets import gui +from Orange.widgets.settings import ContextSetting, Setting, DomainContextHandler +from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin +from Orange.widgets.widget import Input, Msg, Output, OWWidget +from Orange.data.pandas_compat import table_from_frame +from orangecontrib.text.corpus import Corpus +from orangecontrib.network.network import Network + +import spacy +import nltk +nltk.download('perluniprops') +from nltk.tokenize import sent_tokenize, word_tokenize +import pandas as pd + +class OWSNNarrativeNetwork(OWWidget, ConcurrentWidgetMixin): + name = 'Generate Narrative Network' + description = 'Generates a network of entities and story units for visualisation' + icon = 'icons/narrative_network_icon.png' + priority = 6425 + + NL_SPACY_MODEL = "nl_core_news_lg" + + 
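Both widgets lazily load the Dutch spaCy pipeline named above and fall back to downloading it on first use; the tagger widget additionally merges noun chunks and named entities so that multi-word actors are handled as single tokens. A minimal standalone sketch of that shared pattern, assuming the nl_core_news_lg model (not the widgets' exact code path):

import os
import spacy

def load_spacy_pipeline(name="nl_core_news_lg"):
    # Download the model on first use, then load it (mirrors the widgets' helper).
    if not spacy.util.is_package(name):
        os.system(f"spacy download {name}")
    nlp = spacy.load(name)
    # The tagger widget also adds the built-in merging components so that
    # multi-word actors ("de oude kapitein") become one token each.
    nlp.add_pipe("merge_noun_chunks")
    nlp.add_pipe("merge_entities")
    return nlp
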
class Inputs: + corpus = Input("Corpus", Corpus, replaces=["Data"]) + + class Outputs: + edge_data = Output('Edge Data', Table) + node_data = Output('Node Data', Table) + network = Output('Network', Network) + + settingsHandler = DomainContextHandler() + settings_version = 2 + autocommit = Setting(True) + + def __init__(self): + super().__init__() + ConcurrentWidgetMixin.__init__(self) + + self.corpus = None # initialise list of documents (corpus) + self.nlp_nl = None # initialise spacy model + + def load_spacy_pipeline(self, name): + """Check if the spacy language pipeline was downloaded and load it. + Downloads the language pipeline if not available. + + Args: + name (string): Name of the spacy language. + + Returns: + spacy.language.Language: The spacy language pipeline + """ + if spacy.util.is_package(name): + nlp = spacy.load(name) + else: + os.system(f"spacy download {name}") + nlp = spacy.load(name) + return nlp + + @Inputs.corpus + def set_data(self, corpus=None): + self.nlp_nl = self.load_spacy_pipeline(self.NL_SPACY_MODEL) + # self.nlp_nl.add_pipe("merge_noun_chunks") + self.corpus = corpus + self._generate_network(self.corpus) + + def reset_widget(self): + self.corpus = None + self.Warning.clear() + + def encode_data(self, data): + """ + Encodes categorical data (subject, object, verb strings) into numerical identifiers + this is required in order to generate network data that is in the format expected + by the orange-network addon + + Parameters + ---------- + + data : list of lists, + A table of subject, object, action tuples in list of lists format + (each list in the master list is a row of the table) + + Returns + ------- + + result : list of lists, + The data from the original table, plus four new columns + sentence_id, subject_id, object_id and action_id providing encoded + identifiers for the subject, object and action strings + """ + + # convert data into dataframe + df = pd.DataFrame(data, columns = ['story_id', 'sentence', 'subject', 'action', 'object']) + + # initialise dictionary of encoded identifiers + identifiers = {} + # generate list of unique strings from the table data + list_of_strings = [] + list_of_entity_types = [] + + subjects = list(set(df['subject'].tolist())) + subject_types = ['subject'] * len(subjects) + actions = list(set(df['action'].tolist())) + action_types = ['action'] * len(actions) + objects = list(set(df['object'].tolist())) + object_types = ['object'] * len(objects) + + list_of_strings.extend(subjects) + list_of_strings.extend(actions) + list_of_strings.extend(objects) + list_of_entity_types.extend(subject_types) + list_of_entity_types.extend(action_types) + list_of_entity_types.extend(object_types) + + node_df = pd.DataFrame() + node_df['labels'] = list_of_strings + node_df['types'] = list_of_entity_types + + # encode strings + idx = 0 + vals = [] + for item in list_of_strings: + identifiers[item] = idx + vals.append(idx) + idx += 1 + + node_df['node_id'] = vals + print("node: ", len(vals)) + + result = [] + node_labels = [] + for row in data: + new_row = [] + new_row.append(row[0]) # append story id + # new_row.append(identifiers[row[1]]) # append sentence encoding + new_row.append(identifiers[row[2]]) # append subject encoding + new_row.append(identifiers[row[3]]) # append action encoding + new_row.append(identifiers[row[4]]) # append object encoding + result.append(new_row) # add new row with additional columns of data to return variable + + for item in identifiers.keys(): + node_labels.append([identifiers[item], item]) + + return 
result, node_df.to_numpy().tolist() + + def _traverse_ancestors_recursive(self, token, results_s, results_o): + # Base case: No more ancestors to traverse + if not token.ancestors: + return + + # Traverse ancestors recursively until 'nsubj' is found or no more ancestors are left + for ancestor in token.ancestors: + # print('ancestor: ', ancestor, ' dep: ', ancestor.dep_, ' pos: ', ancestor.pos_) + if ancestor.dep_ == 'nsubj' or ancestor.dep_ == 'nsubj:pass' or ancestor.dep_ == 'csubj': + results_s.append(ancestor.text) + elif ancestor.dep_ == 'obj' or ancestor.dep_ == 'iobj' or ancestor.dep_ == 'obl' or ancestor.dep_ == 'obl:agent': + results_o.append(ancestor.text) + self._traverse_ancestors_recursive(ancestor, results_s, results_o) + + def _traverse_children_recursive(self, token, results_s, results_o): + # Base case: No more ancestors to traverse + if not token.children: + return + + # Traverse ancestors recursively until 'nsubj' is found or no more ancestors are left + for child in token.children: + # print('child: ', child, ' dep: ', child.dep_, ' pos: ', child.pos_) + if child.dep_ == 'nsubj' or child.dep_ == 'nsubj:pass' or child.dep_ == 'csubj': + results_s.append(child.text) + elif child.dep_ == 'obj' or child.dep_ == 'iobj' or child.dep_ == 'obl' or child.dep_ == 'obl:agent': + results_o.append(child.text) + self._traverse_children_recursive(child, results_s, results_o) + + def _get_tuples(self, doc, input_word): + """ + Traverses dependency tree to find subjects or objects associated with input deontic (for the legal obligation) + """ + verb = input_word #extract_verb_with_aux(sentence, input_word) + + # Find the input word in the sentence + token = None + for t in doc: + if t.text == verb.text.lower(): + token = t + break + + if token is None: + return [], [] + + results_s_a = [] + results_o_a = [] + results_s_c = [] + results_o_c = [] + self._traverse_ancestors_recursive(token, results_s_a, results_o_a) + self._traverse_children_recursive(token, results_s_c, results_o_c) + + sv_tuples = [] + vo_tuples = [] + for item in results_s_a + results_s_c: + sv_tuples.append((item, verb.text)) + for item in results_o_a + results_o_c: + vo_tuples.append((verb.text, item)) + + return sv_tuples, vo_tuples + + def _merge_binary_tuplelsts_into_ternary_tuplelst(self, list1, list2): + merged_list = [] + for tuple1 in list1: + foundMatch = False + for tuple2 in list2: + if tuple1[1] == tuple2[0]: + foundMatch = True + merged_list.append((tuple1[0], tuple1[1], tuple2[1])) + if not foundMatch: + merged_list.append((tuple1[0], tuple1[1], 'O')) + + return merged_list + + def _generate_network(self, texts): + text_id = 0 + tmp_data = [] + for i in range(0, len(texts)): + txt = str(texts[i, 'content']) + print(len(str(txt))) + sents = sent_tokenize(txt, language='dutch') + + for sent in sents: + tagged_sentence = self.nlp_nl(sent) + + for token in tagged_sentence: + if ('WW' in token.tag_.split('|')): + sv_tuples, vo_tuples = self._get_tuples(tagged_sentence, token) + svo_tuples = self._merge_binary_tuplelsts_into_ternary_tuplelst(sv_tuples, vo_tuples) + + for item in svo_tuples: + tmp_data.append([text_id, "'" + sent + "'", item[0], item[1], item[2]]) + + text_id += 1 + + # encode categorical data (subject, object, verb strings) into numerical identifiers + # this is required in order to generate network data that is in the format expected + # by the orange-network addon + tmp_data_e, tmp_data_n = self.encode_data(tmp_data) + # create a datafame out of the data + edge_data_tmp = 
pd.DataFrame(tmp_data, columns = ['story_id', 'sentence', 'subject', 'action', 'object']) + # edge_data_tmp = pd.DataFrame(tmp_data_e, columns = ['story_id', 'sentence_id', 'subject_id', 'action_id', 'object_id']) + node_data_tmp = pd.DataFrame(tmp_data_n, columns = ['label', 'types', 'node_id']) + + # all_data = pd.DataFrame(tmp_data) + # convert the dataframe to orange table format and set outputs of widget + self.Outputs.edge_data.send(table_from_frame(edge_data_tmp)) + self.Outputs.node_data.send(table_from_frame(node_data_tmp)) + # print() + # print("handoff: ", node_data_tmp['label'].tolist()) + # print() + items = node_data_tmp['label'].tolist() + + # Table.from_list(Domain([], metas=[StringVariable('label')]), node_data_tmp['label'].tolist()) + # items = Table.from_list(Domain([]), node_data_tmp['node_id'].tolist()) + shape = (len(node_data_tmp['label'].tolist()), len(node_data_tmp['label'].tolist())) + + row = [] + col = [] + data = [] + for item in tmp_data: + source_index_sa = node_data_tmp['label'].tolist().index(item[2]) + target_index_sa = node_data_tmp['label'].tolist().index(item[3]) + row.append(source_index_sa) + col.append(target_index_sa) + data.append(1.0) + + source_index_ao = node_data_tmp['label'].tolist().index(item[3]) + target_index_ao = node_data_tmp['label'].tolist().index(item[4]) + row.append(source_index_ao) + col.append(target_index_ao) + data.append(1.0) + + row_np = np.array(row) + col_np = np.array(col) + data_np = np.array(data) + edges = sp.csr_matrix((data_np, (row_np, col_np)), shape=shape) + + self.Outputs.network.send(Network(items, edges)) + + +# if __name__ == "__main__": +# from orangewidget.utils.widgetpreview import WidgetPreview + +# from orangecontrib.text.preprocess import BASE_TOKENIZER + +# corpus_ = Corpus.from_file("book-excerpts") +# corpus_ = corpus_[:3] +# corpus_ = BASE_TOKENIZER(corpus_) +# WidgetPreview(OWSNDSGTagger).run(corpus_) \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/__init__.py b/orangecontrib/storynavigation/widgets/__init__.py index 63a1283..13bddcb 100755 --- a/orangecontrib/storynavigation/widgets/__init__.py +++ b/orangecontrib/storynavigation/widgets/__init__.py @@ -15,7 +15,7 @@ BACKGROUND = "#C0FF97" -ICON = "icons/Category-NavigatingStories.jpg" +ICON = "icons/Category-NavigatingStories.png" # Location of widget help files. WIDGET_HELP_PATH = ( @@ -28,7 +28,7 @@ # Documentation included in wheel # Correct DATA_FILES entry is needed in setup.py and documentation has to be built # before the wheel is created. - ("{}/help/orange3-network/index.html".format(sysconfig.get_path("data")), None), + ("{}/help/orange3-storynavigator/index.html".format(sysconfig.get_path("data")), None), # Online documentation url, used when the local documentation is available. # Url should point to a page with a section Widgets. 
This section should diff --git a/orangecontrib/storynavigation/widgets/icons/Category-NavigatingStories.png b/orangecontrib/storynavigation/widgets/icons/Category-NavigatingStories.png new file mode 100644 index 0000000..7fb4264 Binary files /dev/null and b/orangecontrib/storynavigation/widgets/icons/Category-NavigatingStories.png differ diff --git a/orangecontrib/storynavigation/widgets/icons/Category-NavigatingStories.jpg b/orangecontrib/storynavigation/widgets/icons/Category-NavigatingStories_tmp.jpg similarity index 100% rename from orangecontrib/storynavigation/widgets/icons/Category-NavigatingStories.jpg rename to orangecontrib/storynavigation/widgets/icons/Category-NavigatingStories_tmp.jpg diff --git a/orangecontrib/storynavigation/widgets/icons/OWArgExplorer.svg b/orangecontrib/storynavigation/widgets/icons/OWArgExplorer.svg new file mode 100644 index 0000000..8248033 --- /dev/null +++ b/orangecontrib/storynavigation/widgets/icons/OWArgExplorer.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/icons/download.html b/orangecontrib/storynavigation/widgets/icons/download.html new file mode 100644 index 0000000..6cf2715 --- /dev/null +++ b/orangecontrib/storynavigation/widgets/icons/download.html @@ -0,0 +1,985 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + EEG Emotion Recognition Using Dynamical Graph Convolutional Neural Networks | IEEE Journals & Magazine | IEEE Xplore + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/icons/dsg_ruleset_icon.png b/orangecontrib/storynavigation/widgets/icons/dsg_ruleset_icon.png deleted file mode 100644 index c68e0d3..0000000 Binary files a/orangecontrib/storynavigation/widgets/icons/dsg_ruleset_icon.png and /dev/null differ diff --git a/orangecontrib/storynavigation/widgets/icons/dsg_stanzadep_icon.png b/orangecontrib/storynavigation/widgets/icons/dsg_stanzadep_icon.png index 83d2ec1..fa08d8b 100644 Binary files a/orangecontrib/storynavigation/widgets/icons/dsg_stanzadep_icon.png and b/orangecontrib/storynavigation/widgets/icons/dsg_stanzadep_icon.png differ diff --git a/orangecontrib/storynavigation/widgets/icons/dsgtagger.png b/orangecontrib/storynavigation/widgets/icons/dsgtagger.png new file mode 100644 index 0000000..3e3aece Binary files /dev/null and b/orangecontrib/storynavigation/widgets/icons/dsgtagger.png differ diff --git a/orangecontrib/storynavigation/widgets/icons/dsgtagger.svg b/orangecontrib/storynavigation/widgets/icons/dsgtagger.svg deleted file mode 100644 index 236e886..0000000 --- a/orangecontrib/storynavigation/widgets/icons/dsgtagger.svg +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/orangecontrib/storynavigation/widgets/icons/narrative_network_icon.png b/orangecontrib/storynavigation/widgets/icons/narrative_network_icon.png new file mode 100644 index 0000000..c9cba27 Binary files /dev/null and b/orangecontrib/storynavigation/widgets/icons/narrative_network_icon.png differ diff --git a/orangecontrib/storynavigation/widgets/icons/tall-ship.svg b/orangecontrib/storynavigation/widgets/icons/tall-ship.svg new file mode 100644 index 0000000..a7eaf3d --- /dev/null +++ b/orangecontrib/storynavigation/widgets/icons/tall-ship.svg @@ -0,0 +1,165 @@ + + + + + + + + + + + + + + + image/svg+xml + + + + + Openclipart + + + Tall Ship + 2011-01-14T18:39:46 + Silhouette of a tall ship + https://openclipart.org/detail/104821/tall-ship-by-last-dino + + + Last-Dino + + + + + boat + pirate ship + ship + silhouette + tall ship + + + + + + + + + + + diff --git a/requirements.txt b/requirements.txt index f23f0f8..d07bdf1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ PyQt5==5.15.7 PyQtWebEngine==5.15.6 Orange3==3.34.1 Orange3-Text==1.12.0 +Orange3-network==1.7.0 pandas==1.5.3 regex==2022.10.31 stanza==1.4.2 diff --git a/setup.py b/setup.py index 4e83bde..3aa3cb2 100755 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ 'anyqt', 'Orange3>=3.32', 'Orange3-text>=1.7', + 'Orange3-network>=1.7', 'orange-widget-base', 'scikit-learn', ),
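The Orange3-network dependency added above provides the Network class that OWSNNarrativeNetwork sends on its output. As a condensed, hedged sketch of how _generate_network assembles that output from (subject, action, object) rows into a sparse adjacency matrix — the sample rows, variable names, and the commented-out Network call below are illustrative only:

import numpy as np
import scipy.sparse as sp
# from orangecontrib.network.network import Network  # provided by Orange3-network >= 1.7

# Illustrative (story_id, sentence, subject, action, object) rows,
# as produced by the widget's SVO extraction.
svo_rows = [
    [0, "'De kapitein zag het schip.'", "kapitein", "zag", "schip"],
    [0, "'Het schip voer weg.'", "schip", "voer", "weg"],
]

# One node per unique subject / action / object string.
labels = []
for _, _, subj, act, obj in svo_rows:
    for item in (subj, act, obj):
        if item not in labels:
            labels.append(item)

# Two edges per row: subject -> action and action -> object, weight 1.0 each.
row, col, data = [], [], []
for _, _, subj, act, obj in svo_rows:
    row += [labels.index(subj), labels.index(act)]
    col += [labels.index(act), labels.index(obj)]
    data += [1.0, 1.0]

shape = (len(labels), len(labels))
edges = sp.csr_matrix((np.array(data), (np.array(row), np.array(col))), shape=shape)
# network = Network(labels, edges)  # what the widget sends on its Network output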