Skip to content

Commit

Permalink
Even fastest cyk
Browse files Browse the repository at this point in the history
  • Loading branch information
Chaoukia committed Mar 18, 2019
1 parent 5988b07 commit 033ab0d
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 98 deletions.
130 changes: 81 additions & 49 deletions .ipynb_checkpoints/parser-Copy1-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,56 @@
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"# from time import time\n",
"\n",
"# index_non_terminal = lambda index : dict_indices_non_terminals[index]\n",
"\n",
"# def cyk(sentence):\n",
" \n",
"# n = len(sentence) + 1\n",
"# k = len(non_terminals)\n",
"# scores = [[{} for j in range(n)] for l in range(n)]\n",
"# back = [[[None for i in range(k)] for j in range(n)] for l in range(n)]\n",
"# couples = []\n",
"# for i in range(0, n-1):\n",
"# word = sentence[i]\n",
"# for A in dict_probas['lexical'][word]:\n",
"# scores[i][i+1][A] = dict_probas['lexical'][word][A]\n",
" \n",
"# time_binary = 0\n",
"# for span in range(2, n):\n",
"# start_time_binary = time()\n",
"# for begin in range(0, n-span):\n",
"# end = begin + span\n",
"# start_time_binary = time()\n",
"# for split in range(begin+1, end):\n",
"# Bs = set(scores[begin][split].keys()).intersection(B_binary)\n",
"# Cs = set(scores[split][end].keys()).intersection(C_binary)\n",
" \n",
"# for B, C in set_binary.intersection(set(product(Bs, Cs))):\n",
" \n",
"# score_B, score_C = scores[begin][split].get(B, -np.inf), scores[split][end].get(C, -np.inf)\n",
"# for A in dict_probas['binary'][(B, C)]:\n",
"# couples.append((A, B, C))\n",
"# prob = score_B + score_C + dict_probas['binary'][(B, C)][A]\n",
"# if prob > scores[begin][end].get(A, -np.inf):\n",
"# scores[begin][end][A] = prob\n",
"# back[begin][end][dict_non_terminals_indices[A]] = (split, B, C) \n",
" \n",
"# time_binary += time() - start_time_binary\n",
" \n",
"# print('binary time %.6f' %time_binary)\n",
"# print(len(couples))\n",
" \n",
"# return scores, back"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [],
"source": [
"from time import time\n",
"\n",
Expand All @@ -453,36 +503,40 @@
" k = len(non_terminals)\n",
" scores = [[{} for j in range(n)] for l in range(n)]\n",
" back = [[[None for i in range(k)] for j in range(n)] for l in range(n)]\n",
" couples = []\n",
" Bs_scores, Cs_scores = [[set() for j in range(n)] for l in range(n)], [[set() for j in range(n)] for l in range(n)]\n",
" for i in range(0, n-1):\n",
" word = sentence[i]\n",
" for A in dict_probas['lexical'][word]:\n",
" scores[i][i+1][A] = dict_probas['lexical'][word][A]\n",
" if A in B_binary:\n",
" Bs_scores[i][i+1].add(A)\n",
" \n",
" if A in C_binary:\n",
" Cs_scores[i][i+1].add(A)\n",
" \n",
" time_binary = 0\n",
" for span in range(2, n):\n",
" start_time_binary = time()\n",
" for begin in range(0, n-span):\n",
" end = begin + span\n",
" start_time_binary = time()\n",
" for split in range(begin+1, end):\n",
" Bs = set(scores[begin][split].keys()).intersection(B_binary)\n",
" Cs = set(scores[split][end].keys()).intersection(C_binary)\n",
"# Bs = set(scores[begin][split].keys()).intersection(B_binary)\n",
"# Cs = set(scores[split][end].keys()).intersection(C_binary)\n",
" Bs = Bs_scores[begin][split].intersection(B_binary)\n",
" Cs = Cs_scores[split][end].intersection(C_binary)\n",
" \n",
" for B, C in set_binary.intersection(set(product(Bs, Cs))):\n",
" \n",
" score_B, score_C = scores[begin][split].get(B, -np.inf), scores[split][end].get(C, -np.inf)\n",
" for A in dict_probas['binary'][(B, C)]:\n",
" couples.append((A, B, C))\n",
" prob = score_B + score_C + dict_probas['binary'][(B, C)][A]\n",
" if prob > scores[begin][end].get(A, -np.inf):\n",
" scores[begin][end][A] = prob\n",
" back[begin][end][dict_non_terminals_indices[A]] = (split, B, C) \n",
" \n",
" time_binary += time() - start_time_binary\n",
" \n",
" print('binary time %.6f' %time_binary)\n",
" print(len(couples))\n",
" back[begin][end][dict_non_terminals_indices[A]] = (split, B, C)\n",
" if A in B_binary:\n",
" Bs_scores[begin][end].add(A)\n",
"\n",
" if A in C_binary:\n",
" Cs_scores[begin][end].add(A)\n",
" \n",
" return scores, back"
]
Expand All @@ -504,26 +558,6 @@
"%timeit np.where(np.array([0, 3, 6, 9, 0, 1, 4, 7, 5, 6, 3, 0]) == 0)[0]"
]
},
{
"cell_type": "code",
"execution_count": 238,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2767"
]
},
"execution_count": 238,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dict_probas['binary'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -533,7 +567,7 @@
},
{
"cell_type": "code",
"execution_count": 165,
"execution_count": 183,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -608,7 +642,7 @@
},
{
"cell_type": "code",
"execution_count": 166,
"execution_count": 184,
"metadata": {},
"outputs": [
{
Expand All @@ -618,7 +652,7 @@
"Tree('+SENT', [Tree('NP', [Tree('DET', ['Une']), Tree('NP|<NC-COORD>', [Tree('NC', ['heure']), Tree('COORD', [Tree('CC', ['et']), Tree('NP', [Tree('DET', ['vingt']), Tree('NC', ['minutes'])])])])]), Tree('SENT|<PONCT-VN>', [Tree('PONCT', [',']), Tree('SENT|<VN-NP>', [Tree('VN', [Tree('CLS', [\"c'\"]), Tree('V', ['est'])]), Tree('SENT|<NP-PONCT>', [Tree('NP', [Tree('DET', ['le']), Tree('NP|<NC-Srel>', [Tree('NC', ['temps']), Tree('Srel', [Tree('NP+PROREL', [\"qu'\"]), Tree('Srel|<VN-PP>', [Tree('VN', [Tree('CLS', ['il']), Tree('VN|<V-VPP>', [Tree('V', ['aura']), Tree('VPP', ['fallu'])])]), Tree('Srel|<PP-PONCT>', [Tree('PP', [Tree('P', ['à']), Tree('NP', [Tree('NPP', ['Thierry']), Tree('NP|<NPP-PONCT>', [Tree('NPP', ['Guerry']), Tree('NP|<PONCT-NP>', [Tree('PONCT', [',']), Tree('NP', [Tree('NC', ['chauffeur-routier']), Tree('PP', [Tree('P', ['chez']), Tree('NP', [Tree('NPP', ['Caillaud']), Tree('NP|<PONCT-NP>', [Tree('PONCT', [',']), Tree('NP', [Tree('DET', [\"l'\"]), Tree('NP|<NC-PP>', [Tree('NC', ['entreprise']), Tree('NP|<PP-VPpart>', [Tree('PP', [Tree('P', ['en']), Tree('NP+NC', ['charpente'])]), Tree('VPpart', [Tree('VPP', ['chargée']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NP|<NC-PP>', [Tree('NC', ['pose']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NP|<NC-PP>', [Tree('NC', ['toiture']), Tree('PP', [Tree('P+D', ['des']), Tree('NP', [Tree('NC', ['cours']), Tree('NP|<PP-AP>', [Tree('PP', [Tree('P', ['de']), Tree('NP+NC', ['tennis'])]), Tree('AP+ADJ', ['couverts'])])])])])])])])])])])])])])])])])])])])])]), Tree('Srel|<PONCT-VPinf>', [Tree('PONCT', [',']), Tree('VPinf', [Tree('P', ['pour']), Tree('VPinf|<VN-PP>', [Tree('VN+VINF', ['pénétrer']), Tree('VPinf|<PP-PONCT>', [Tree('PP', [Tree('P', ['dans']), Tree('NP', [Tree('DET', [\"l'\"]), Tree('NP|<NC-PP>', [Tree('NC', ['enceinte']), Tree('PP', [Tree('P+D', ['du']), Tree('NP', [Tree('NC', ['complexe']), Tree('NP|<AP-PP>', [Tree('AP+ADJ', ['sportif']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['commune'])])])])])])])])]), Tree('VPinf|<PONCT-PP>', [Tree('PONCT', [',']), Tree('PP', [Tree('P', ['avec']), Tree('NP', [Tree('DET', ['son']), Tree('NP|<NC-PONCT>', [Tree('NC', ['semi-remorque']), Tree('NP|<PONCT-Srel>', [Tree('PONCT', [',']), Tree('Srel', [Tree('PP', [Tree('P', ['sur']), Tree('NP+PROREL', ['lequel'])]), Tree('Srel|<VN-NP>', [Tree('VN', [Tree('V', ['étaient']), Tree('VPP', ['chargées'])]), Tree('NP', [Tree('DET', ['quatorze']), Tree('NP|<NC-PP>', [Tree('NC', ['tonnes']), Tree('PP', [Tree('P', [\"d'\"]), Tree('NP', [Tree('NC', ['éléments']), Tree('NP|<PP-PP>', [Tree('PP', [Tree('P', ['en']), Tree('NP+NC', ['bois'])]), Tree('NP|<PP-PONCT>', [Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['trente']), Tree('NP|<NC-PP>', [Tree('NC', ['mètres']), Tree('PP', [Tree('P', ['de']), Tree('NP+ADJ', ['long'])])])])]), Tree('NP|<PONCT-VPpart>', [Tree('PONCT', [',']), Tree('VPpart', [Tree('VPP', ['destinés']), Tree('PP', [Tree('P', ['à']), Tree('NP', [Tree('DET', ['la']), Tree('NP|<NC-PP>', [Tree('NC', ['couverture']), Tree('PP', [Tree('P+D', ['du']), Tree('NP', [Tree('NC', ['bâtiment']), Tree('PP', [Tree('P', ['en']), Tree('NP', [Tree('NC', ['construction']), Tree('PP', [Tree('P', ['sur']), Tree('NP', [Tree('DET', ['le']), Tree('NC', ['stade'])])])])])])])])])])])])])])])])])])])])])])])])])])])])])])])])])]), Tree('PONCT', ['.'])])])])])"
]
},
"execution_count": 166,
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -632,7 +666,7 @@
},
{
"cell_type": "code",
"execution_count": 167,
"execution_count": 185,
"metadata": {},
"outputs": [
{
Expand All @@ -641,7 +675,7 @@
"83"
]
},
"execution_count": 167,
"execution_count": 185,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -654,16 +688,14 @@
},
{
"cell_type": "code",
"execution_count": 168,
"execution_count": 192,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"binary time 13.887385\n",
"5691533\n",
"14.423448085784912\n"
"11.762866735458374\n"
]
}
],
Expand All @@ -676,7 +708,7 @@
},
{
"cell_type": "code",
"execution_count": 169,
"execution_count": 187,
"metadata": {},
"outputs": [
{
Expand All @@ -686,7 +718,7 @@
"Tree('', [Tree('SENT', [Tree('NP', [Tree('DET', ['Une']), Tree('NC', ['heure']), Tree('COORD', [Tree('CC', ['et']), Tree('NP', [Tree('DET', ['vingt']), Tree('NC', ['minutes'])])])]), Tree('PONCT', [',']), Tree('VN', [Tree('CLS', [\"c'\"]), Tree('V', ['est'])]), Tree('NP', [Tree('DET', ['le']), Tree('NC', ['temps']), Tree('Srel', [Tree('NP', [Tree('PROREL', [\"qu'\"])]), Tree('VN', [Tree('CLS', ['il']), Tree('V', ['aura']), Tree('VPP', ['fallu'])]), Tree('PP', [Tree('P', ['à']), Tree('NP', [Tree('NPP', ['Thierry']), Tree('NPP', ['Guerry']), Tree('PONCT', [',']), Tree('NP', [Tree('NC', ['chauffeur-routier']), Tree('PP', [Tree('P', ['chez']), Tree('NP', [Tree('NPP', ['Caillaud']), Tree('PONCT', [',']), Tree('NP', [Tree('DET', [\"l'\"]), Tree('NC', ['entreprise']), Tree('PP', [Tree('P', ['en']), Tree('NP', [Tree('NC', ['charpente'])])]), Tree('VPpart', [Tree('VPP', ['chargée']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['pose']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['toiture']), Tree('PP', [Tree('P', [Tree('D', ['des'])]), Tree('NP', [Tree('NC', ['cours']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('NC', ['tennis'])])]), Tree('AP', [Tree('ADJ', ['couverts'])])])])])])])])])])])])])])]), Tree('PONCT', [',']), Tree('VPinf', [Tree('P', ['pour']), Tree('VN', [Tree('VINF', ['pénétrer'])]), Tree('PP', [Tree('P', ['dans']), Tree('NP', [Tree('DET', [\"l'\"]), Tree('NC', ['enceinte']), Tree('PP', [Tree('P', [Tree('D', ['du'])]), Tree('NP', [Tree('NC', ['complexe']), Tree('AP', [Tree('ADJ', ['sportif'])]), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['commune'])])])])])])]), Tree('PONCT', [',']), Tree('PP', [Tree('P', ['avec']), Tree('NP', [Tree('DET', ['son']), Tree('NC', ['semi-remorque']), Tree('PONCT', [',']), Tree('Srel', [Tree('PP', [Tree('P', ['sur']), Tree('NP', [Tree('PROREL', ['lequel'])])]), Tree('VN', [Tree('V', ['étaient']), Tree('VPP', ['chargées'])]), Tree('NP', [Tree('DET', ['quatorze']), Tree('NC', ['tonnes']), Tree('PP', [Tree('P', [\"d'\"]), Tree('NP', [Tree('NC', ['éléments']), Tree('PP', [Tree('P', ['en']), Tree('NP', [Tree('NC', ['bois'])])]), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['trente']), Tree('NC', ['mètres']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('ADJ', ['long'])])])])]), Tree('PONCT', [',']), Tree('VPpart', [Tree('VPP', ['destinés']), Tree('PP', [Tree('P', ['à']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['couverture']), Tree('PP', [Tree('P', [Tree('D', ['du'])]), Tree('NP', [Tree('NC', ['bâtiment']), Tree('PP', [Tree('P', ['en']), Tree('NP', [Tree('NC', ['construction']), Tree('PP', [Tree('P', ['sur']), Tree('NP', [Tree('DET', ['le']), Tree('NC', ['stade'])])])])])])])])])])])])])])])])])])]), Tree('PONCT', ['.'])])])"
]
},
"execution_count": 169,
"execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -698,7 +730,7 @@
},
{
"cell_type": "code",
"execution_count": 170,
"execution_count": 188,
"metadata": {},
"outputs": [
{
Expand All @@ -715,7 +747,7 @@
"Tree('', [Tree('SENT', [Tree('NP', [Tree('DET', ['Une']), Tree('NC', ['heure']), Tree('COORD', [Tree('CC', ['et']), Tree('NP', [Tree('DET', ['vingt']), Tree('NC', ['minutes'])])])]), Tree('PONCT', [',']), Tree('VN', [Tree('CLS', [\"c'\"]), Tree('V', ['est'])]), Tree('NP', [Tree('DET', ['le']), Tree('NC', ['temps']), Tree('Srel', [Tree('NP', [Tree('PROREL', [\"qu'\"])]), Tree('VN', [Tree('CLS', ['il']), Tree('V', ['aura']), Tree('VPP', ['fallu'])]), Tree('PP', [Tree('P', ['à']), Tree('NP', [Tree('NPP', ['Thierry']), Tree('NPP', ['Guerry'])])])])]), Tree('PONCT', [',']), Tree('NP', [Tree('NC', ['chauffeur-routier']), Tree('PP', [Tree('P', ['chez']), Tree('NP', [Tree('NPP', ['Caillaud']), Tree('PONCT', [',']), Tree('NP', [Tree('DET', [\"l'\"]), Tree('NC', ['entreprise']), Tree('PP', [Tree('P', ['en']), Tree('NP', [Tree('NC', ['charpente'])])]), Tree('VPpart', [Tree('VPP', ['chargée']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['pose']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['toiture']), Tree('PP', [Tree('P', [Tree('D', ['des'])]), Tree('NP', [Tree('NC', ['cours'])])]), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('NC', ['tennis'])])])])]), Tree('AP', [Tree('ADJ', ['couverts'])])])])])]), Tree('PONCT', [','])])]), Tree('VPinf', [Tree('P', ['pour']), Tree('VN', [Tree('VINF', ['pénétrer'])]), Tree('PP', [Tree('P', ['dans']), Tree('NP', [Tree('DET', [\"l'\"]), Tree('NC', ['enceinte']), Tree('PP', [Tree('P', [Tree('D', ['du'])]), Tree('NP', [Tree('NC', ['complexe']), Tree('AP', [Tree('ADJ', ['sportif'])]), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['commune'])])])])])])]), Tree('PONCT', [',']), Tree('PP', [Tree('P', ['avec']), Tree('NP', [Tree('DET', ['son']), Tree('NC', ['semi-remorque'])])]), Tree('PONCT', [',']), Tree('PP', [Tree('P', ['sur']), Tree('NP', [Tree('PROREL', ['lequel'])])])])]), Tree('VN', [Tree('V', ['étaient']), Tree('VPP', ['chargées'])]), Tree('NP', [Tree('DET', ['quatorze']), Tree('NC', ['tonnes']), Tree('PP', [Tree('P', [\"d'\"]), Tree('NP', [Tree('NC', ['éléments']), Tree('PP', [Tree('P', ['en']), Tree('NP', [Tree('NC', ['bois'])])])])])]), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('DET', ['trente']), Tree('NC', ['mètres']), Tree('PP', [Tree('P', ['de']), Tree('NP', [Tree('ADJ', ['long'])])])])]), Tree('PONCT', [',']), Tree('VPpart', [Tree('VPP', ['destinés']), Tree('PP', [Tree('P', ['à']), Tree('NP', [Tree('DET', ['la']), Tree('NC', ['couverture']), Tree('PP', [Tree('P', [Tree('D', ['du'])]), Tree('NP', [Tree('NC', ['bâtiment'])])]), Tree('PP', [Tree('P', ['en']), Tree('NP', [Tree('NC', ['construction'])])])])]), Tree('PP', [Tree('P', ['sur']), Tree('NP', [Tree('DET', ['le']), Tree('NC', ['stade'])])])]), Tree('PONCT', ['.'])])])"
]
},
"execution_count": 170,
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -728,7 +760,7 @@
},
{
"cell_type": "code",
"execution_count": 171,
"execution_count": 189,
"metadata": {},
"outputs": [
{
Expand All @@ -737,7 +769,7 @@
"False"
]
},
"execution_count": 171,
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -755,7 +787,7 @@
},
{
"cell_type": "code",
"execution_count": 172,
"execution_count": 190,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -804,7 +836,7 @@
},
{
"cell_type": "code",
"execution_count": 173,
"execution_count": 191,
"metadata": {},
"outputs": [
{
Expand Down
Loading

0 comments on commit 033ab0d

Please sign in to comment.