From d0a3b94a5fa6a3fb2fb93e5912ddae4a7b9d3646 Mon Sep 17 00:00:00 2001 From: Vincent Simon Date: Thu, 26 Feb 2015 21:10:24 +0100 Subject: [PATCH] Version1 --- ...1guli\303\250res (Regex)-checkpoint.ipynb" | 587 +++++++++++++++ ...ns r\303\251guli\303\250res (Regex).ipynb" | 683 ++++++++++++++++++ 2 files changed, 1270 insertions(+) create mode 100644 "day3/.ipynb_checkpoints/03 - Focus sur les expressions r\303\251guli\303\250res (Regex)-checkpoint.ipynb" create mode 100644 "day3/03 - Focus sur les expressions r\303\251guli\303\250res (Regex).ipynb" diff --git "a/day3/.ipynb_checkpoints/03 - Focus sur les expressions r\303\251guli\303\250res (Regex)-checkpoint.ipynb" "b/day3/.ipynb_checkpoints/03 - Focus sur les expressions r\303\251guli\303\250res (Regex)-checkpoint.ipynb" new file mode 100644 index 0000000..fe7a080 --- /dev/null +++ "b/day3/.ipynb_checkpoints/03 - Focus sur les expressions r\303\251guli\303\250res (Regex)-checkpoint.ipynb" @@ -0,0 +1,587 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:e03de3b803149d3aa073adb0234b1e547fe4a78754bd2d1f1ad55fe0f6a9af05" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Les expressions r\u00e9guli\u00e8res" + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ce focus va vous permettre de toucher du doigt la notion d'expressions r\u00e9guli\u00e8res, cependant il serait trop long de tout voir en d\u00e9tail.
\n", + "\n", + "Pour ceux qui veulent en savoir plus : [Learn Regex The Hard Way](http://regex.learncodethehardway.org/book/)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Les expressions r\u00e9guli\u00e8res sont des patterns que l'on cr\u00e9e afin de soit :
\n", + "- s\u00e9lectionner l'information
\n", + "- diviser l'information
\n", + "- effectuer une validation
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Afin de tester et de cr\u00e9er son pattern, il est fortement conseiller d'utiliser un testeur d'expressions r\u00e9guli\u00e8res tel que [RegexR](http://www.regexr.com/)" + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Regex sur iPython et exemple" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Afin de pouvoir utiliser les expressions reguli\u00e8res il faut importer le module ```re``` " + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "\n", + "import pandas as pd\n", + "import re" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 14 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vous avez un DataFrame compos\u00e9 de noms et d'adresses emails, vous souhaiteriez n'avoir que les adresse emails." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = pd.DataFrame(['Martin Daniel',\n", + " 'martin@gmail.com', \n", + " 'Vincent Simon', \n", + " 'vincent.simon@laposte.net', \n", + " 'Bob', \n", + " 'bobby@zimmerman.com'],columns=['noms&mails'])\n", + "df" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
noms&mails
0 Martin Daniel
1 martin@gmail.com
2 Vincent Simon
3 vincent.simon@laposte.net
4 Bob
5 bobby@zimmerman.com
\n", + "

6 rows \u00d7 1 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 40, + "text": [ + " noms&mails\n", + "0 Martin Daniel\n", + "1 martin@gmail.com\n", + "2 Vincent Simon\n", + "3 vincent.simon@laposte.net\n", + "4 Bob\n", + "5 bobby@zimmerman.com\n", + "\n", + "[6 rows x 1 columns]" + ] + } + ], + "prompt_number": 40 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Ici nous allons d\u00e9finir le pattern de l'expression r\u00e9guli\u00e8re que nous allons utiliser.\n", + "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'\n", + "\n", + "# re.IGNORECASE permet d'ignorer la casse. La fonction .match() permet de renvoyer un bool\u00e9en True or False.\n", + "resultat = df['noms&mails'].str.match(pattern, re.IGNORECASE)\n", + "print resultat" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 False\n", + "1 True\n", + "2 False\n", + "3 True\n", + "4 False\n", + "5 True\n", + "Name: noms&mails, dtype: bool\n" + ] + } + ], + "prompt_number": 43 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Construire une expression r\u00e9guli\u00e8re" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construisons un DataFrame compos\u00e9 de differents types de donn\u00e9es et regardons les diff\u00e9rents r\u00e9sultats :" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df2 = pd.DataFrame(['Martin Daniel',\n", + " 'martin@gmail.com', \n", + " '1234', \n", + " '0637687898', \n", + " 'Bob'],columns=['donn\u00e9es'])\n", + "df2" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
donn\u00e9es
0 Martin Daniel
1 martin@gmail.com
2 1234
3 0637687898
4 Bob
\n", + "

5 rows \u00d7 1 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 45, + "text": [ + " donn\u00e9es\n", + "0 Martin Daniel\n", + "1 martin@gmail.com\n", + "2 1234\n", + "3 0637687898\n", + "4 Bob\n", + "\n", + "[5 rows x 1 columns]" + ] + } + ], + "prompt_number": 45 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "1er pattern : (\\w)" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern1 = r'\\w'\n", + "\n", + "resultat1 = df2['donn\u00e9es'].str.match(pattern1, re.IGNORECASE)\n", + "print resultat1" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 True\n", + "1 True\n", + "2 True\n", + "3 True\n", + "4 True\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 58 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern1 s\u00e9l\u00e9ctionne tous les caract\u00e8res sauf les caract\u00e8res sp\u00e9ciaux." + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "2eme pattern : \\d" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern2 = r'\\d'\n", + "\n", + "resultat2 = df2['donn\u00e9es'].str.match(pattern2, re.IGNORECASE)\n", + "print resultat2" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 False\n", + "1 False\n", + "2 True\n", + "3 True\n", + "4 False\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 60 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern2 n'a pris en compte que les donn\u00e9es num\u00e9riques." 
+ ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "3eme pattern : \\D" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern3 = r'\\D'\n", + "\n", + "resultat3 = df2['donn\u00e9es'].str.match(pattern3, re.IGNORECASE)\n", + "print resultat3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 True\n", + "1 True\n", + "2 False\n", + "3 False\n", + "4 True\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 62 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern3 a pris compte toutes les donn\u00e9es **n'\u00e9tant pas** num\u00e9riques" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "4eme pattern : 06\\d{8}" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern4 = r'06\\d{8}'\n", + "\n", + "resultat4 = df2['donn\u00e9es'].str.match(pattern4, re.IGNORECASE)\n", + "print resultat4" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 True\n", + "4 False\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 69 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern4 a pris en compte le seul num\u00e9ro de t\u00e9l\u00e9phone portable." + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Les fonctions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jusqu'ici nous avons utiliser la fonction .match() qui renvoie un bool\u00e9en ```True``` ou ```False```.
\n", + "Il y a plusieurs autres fonctions tr\u00e8s utiles :" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Reprenons notre DataFrame initial :\n", + "df3 = pd.DataFrame(['Martin Daniel',\n", + " 'martin@gmail.com', \n", + " 'Vincent Simon', \n", + " 'vincent.simon@laposte.net', \n", + " 'Bob', \n", + " 'bobby@zimmerman.com, bobby@zimm.fr'],columns=['noms&mails'])\n", + "df3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
noms&mails
0 Martin Daniel
1 martin@gmail.com
2 Vincent Simon
3 vincent.simon@laposte.net
4 Bob
5 bobby@zimmerman.com
\n", + "

6 rows \u00d7 1 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 70, + "text": [ + " noms&mails\n", + "0 Martin Daniel\n", + "1 martin@gmail.com\n", + "2 Vincent Simon\n", + "3 vincent.simon@laposte.net\n", + "4 Bob\n", + "5 bobby@zimmerman.com\n", + "\n", + "[6 rows x 1 columns]" + ] + } + ], + "prompt_number": 70 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 72 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + ".count()" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "resultatcount = df['noms&mails'].str.count(pattern, re.IGNORECASE)\n", + "print resultatcount" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 0\n", + "1 1\n", + "2 0\n", + "3 1\n", + "4 0\n", + "5 1\n", + "Name: noms&mails, dtype: int64\n" + ] + } + ], + "prompt_number": 73 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "La m\u00e9thode .count() compte le nombre d'occurences du pattern" + ] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git "a/day3/03 - Focus sur les expressions r\303\251guli\303\250res (Regex).ipynb" "b/day3/03 - Focus sur les expressions r\303\251guli\303\250res (Regex).ipynb" new file mode 100644 index 0000000..dee9a15 --- /dev/null +++ "b/day3/03 - Focus sur les expressions r\303\251guli\303\250res (Regex).ipynb" @@ -0,0 +1,683 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:83d5340cfc6187353dd41abead5a22c1c117ec3bf55e5a3d46574a2d709ed91d" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Les expressions r\u00e9guli\u00e8res" + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ 
+ "Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ce focus va vous permettre de toucher du doigt la notion d'expressions r\u00e9guli\u00e8res, cependant il serait trop long de tout voir en d\u00e9tail.
\n", + "\n", + "Pour ceux qui veulent en savoir plus : [Learn Regex The Hard Way](http://regex.learncodethehardway.org/book/)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Les expressions r\u00e9guli\u00e8res sont des patterns que l'on cr\u00e9e afin de soit :
\n", + "- s\u00e9lectionner l'information
\n", + "- diviser l'information
\n", + "- effectuer une validation
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Afin de tester et de cr\u00e9er son pattern, il est fortement conseiller d'utiliser un testeur d'expressions r\u00e9guli\u00e8res tel que [RegexR](http://www.regexr.com/)" + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Regex sur iPython et exemple" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Afin de pouvoir utiliser les expressions reguli\u00e8res il faut importer le module ```re``` " + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "\n", + "import pandas as pd\n", + "import re" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 14 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vous avez un DataFrame compos\u00e9 de noms et d'adresses emails, vous souhaiteriez n'avoir que les adresse emails." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = pd.DataFrame(['Martin Daniel',\n", + " 'martin@gmail.com', \n", + " 'Vincent Simon', \n", + " 'vincent.simon@laposte.net', \n", + " 'Bob', \n", + " 'bobby@zimmerman.com'],columns=['noms&mails'])\n", + "df" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
noms&mails
0 Martin Daniel
1 martin@gmail.com
2 Vincent Simon
3 vincent.simon@laposte.net
4 Bob
5 bobby@zimmerman.com
\n", + "

6 rows \u00d7 1 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 40, + "text": [ + " noms&mails\n", + "0 Martin Daniel\n", + "1 martin@gmail.com\n", + "2 Vincent Simon\n", + "3 vincent.simon@laposte.net\n", + "4 Bob\n", + "5 bobby@zimmerman.com\n", + "\n", + "[6 rows x 1 columns]" + ] + } + ], + "prompt_number": 40 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Ici nous allons d\u00e9finir le pattern de l'expression r\u00e9guli\u00e8re que nous allons utiliser.\n", + "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'\n", + "\n", + "# re.IGNORECASE permet d'ignorer la casse. La fonction .match() permet de renvoyer un bool\u00e9en True or False.\n", + "resultat = df['noms&mails'].str.match(pattern, re.IGNORECASE)\n", + "print resultat" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 False\n", + "1 True\n", + "2 False\n", + "3 True\n", + "4 False\n", + "5 True\n", + "Name: noms&mails, dtype: bool\n" + ] + } + ], + "prompt_number": 43 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Construire une expression r\u00e9guli\u00e8re" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construisons un DataFrame compos\u00e9 de differents types de donn\u00e9es et regardons les diff\u00e9rents r\u00e9sultats :" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df2 = pd.DataFrame(['Martin Daniel',\n", + " 'martin@gmail.com', \n", + " '1234', \n", + " '0637687898', \n", + " 'Bob'],columns=['donn\u00e9es'])\n", + "df2" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
donn\u00e9es
0 Martin Daniel
1 martin@gmail.com
2 1234
3 0637687898
4 Bob
\n", + "

5 rows \u00d7 1 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 45, + "text": [ + " donn\u00e9es\n", + "0 Martin Daniel\n", + "1 martin@gmail.com\n", + "2 1234\n", + "3 0637687898\n", + "4 Bob\n", + "\n", + "[5 rows x 1 columns]" + ] + } + ], + "prompt_number": 45 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "1er pattern : (\\w)" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern1 = r'\\w'\n", + "\n", + "resultat1 = df2['donn\u00e9es'].str.match(pattern1, re.IGNORECASE)\n", + "print resultat1" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 True\n", + "1 True\n", + "2 True\n", + "3 True\n", + "4 True\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 58 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern1 s\u00e9l\u00e9ctionne tous les caract\u00e8res sauf les caract\u00e8res sp\u00e9ciaux." + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "2eme pattern : \\d" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern2 = r'\\d'\n", + "\n", + "resultat2 = df2['donn\u00e9es'].str.match(pattern2, re.IGNORECASE)\n", + "print resultat2" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 False\n", + "1 False\n", + "2 True\n", + "3 True\n", + "4 False\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 60 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern2 n'a pris en compte que les donn\u00e9es num\u00e9riques." 
+ ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "3eme pattern : \\D" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern3 = r'\\D'\n", + "\n", + "resultat3 = df2['donn\u00e9es'].str.match(pattern3, re.IGNORECASE)\n", + "print resultat3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 True\n", + "1 True\n", + "2 False\n", + "3 False\n", + "4 True\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 62 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern3 a pris compte toutes les donn\u00e9es **n'\u00e9tant pas** num\u00e9riques" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "4eme pattern : 06\\d{8}" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern4 = r'06\\d{8}'\n", + "\n", + "resultat4 = df2['donn\u00e9es'].str.match(pattern4, re.IGNORECASE)\n", + "print resultat4" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 True\n", + "4 False\n", + "Name: donn\u00e9es, dtype: bool\n" + ] + } + ], + "prompt_number": 69 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notre pattern4 a pris en compte le seul num\u00e9ro de t\u00e9l\u00e9phone portable." + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Les fonctions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jusqu'ici nous avons utiliser la fonction .match() qui renvoie un bool\u00e9en ```True``` ou ```False```.
\n", + "Il y a plusieurs autres fonctions tr\u00e8s utiles :" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Reprenons notre DataFrame initial en ajoutant une deuxi\u00e8me adresse \u00e0 Bobby :\n", + "df3 = pd.DataFrame(['Martin Daniel',\n", + " 'martin@gmail.com', \n", + " 'Vincent Simon', \n", + " 'vincent.simon@laposte.net', \n", + " 'Bob', \n", + " 'bobby@zimmerman.com, bobby@zimm.fr'],columns=['noms&mails'])\n", + "df3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
noms&mails
0 Martin Daniel
1 martin@gmail.com
2 Vincent Simon
3 vincent.simon@laposte.net
4 Bob
5 bobby@zimmerman.com, bobby@zimm.fr
\n", + "

6 rows \u00d7 1 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 75, + "text": [ + " noms&mails\n", + "0 Martin Daniel\n", + "1 martin@gmail.com\n", + "2 Vincent Simon\n", + "3 vincent.simon@laposte.net\n", + "4 Bob\n", + "5 bobby@zimmerman.com, bobby@zimm.fr\n", + "\n", + "[6 rows x 1 columns]" + ] + } + ], + "prompt_number": 75 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 72 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + ".count()" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "resultatcount = df3['noms&mails'].str.count(pattern, re.IGNORECASE)\n", + "print resultatcount" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 0\n", + "1 1\n", + "2 0\n", + "3 1\n", + "4 0\n", + "5 2\n", + "Name: noms&mails, dtype: int64\n" + ] + } + ], + "prompt_number": 78 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "La m\u00e9thode .count() compte le nombre d'occurences du pattern." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "resultatfindall = df3['noms&mails'].str.findall(pattern, re.IGNORECASE)\n", + "print resultatfindall" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0 []\n", + "1 [martin@gmail.com]\n", + "2 []\n", + "3 [vincent.simon@laposte.net]\n", + "4 []\n", + "5 [bobby@zimmerman.com, bobby@zimm.fr]\n", + "Name: noms&mails, dtype: object\n" + ] + } + ], + "prompt_number": 95 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "La m\u00e9thode .findall() permet d'afficher les r\u00e9sultats li\u00e9s au pattern.
" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Pour afficher ces r\u00e9sultats de fa\u00e7on plus propre, \n", + "# nous pouvons utiliser la m\u00e9thode .str[0] pour avoir le premier \u00e9l\u00e9ment de chaque liste de r\u00e9sultats\n", + "resultatfindall.str[0]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 96, + "text": [ + "0 NaN\n", + "1 martin@gmail.com\n", + "2 NaN\n", + "3 vincent.simon@laposte.net\n", + "4 NaN\n", + "5 bobby@zimmerman.com\n", + "Name: noms&mails, dtype: object" + ] + } + ], + "prompt_number": 96 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Pour avoir le deuxi\u00e8me \u00e9l\u00e9ment nous jouons sur l'index\n", + "resultatfindall.str[1]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 97, + "text": [ + "0 NaN\n", + "1 NaN\n", + "2 NaN\n", + "3 NaN\n", + "4 NaN\n", + "5 bobby@zimm.fr\n", + "Name: noms&mails, dtype: object" + ] + } + ], + "prompt_number": 97 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file