10xac · Fisseha-Estifanos · Aug 8, 2022 · Aug 8, 2022 · Aug 8, 2022 · Aug 8, 2022
diff --git a/.dvc/.gitignore b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
diff --git a/.dvcignore b/.dvcignore
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -1,29 +1,29 @@
-name: twitter-data-analysis
-
-on:
-  push:
-    branches: [main]
-  pull_request:
-    branches: [main]
-
-permissions:
-  contents: read
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install pytest
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-      - name: Test with pytest
-        run: |
-          python -m pytest
+# CI workflow: runs the pytest suite for pushes and pull requests on main.
+name: twitter-data-analysis
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+# The job only needs read access to the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v3
+        with:
+          # Quoted so YAML does not read the version as the float 3.1.
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+      - name: Test with pytest
+        run: python -m pytest
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
-__pycache__/
-data/
+__pycache__/
+data/
+.ipynb_checkpoints
diff --git a/LICENSE b/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2022 10 Academy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2022 10 Academy
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -1,29 +1,29 @@
-# Twitter-Data-Analysis
-
-### So here are the bare minimum requirement for completing this task
-
-1. Use this template to create a repository called Twitter-Data-Analysis in your github account. See ["Creating a repository from a template."](https://docs.github.com/en/articles/creating-a-repository-from-a-template) for more information.
-2. [Download](https://drive.google.com/drive/folders/19G8dmehf9vU0u6VTKGV-yWsQOn3IvPsd) and extract the necessary data and put it in the data directory. The data should not not be added to git tracking.
-3. Create a branch called “bugfix” to fix the bugs in the fix_clean_tweets_dataframe.py and fix_extract_dataframe.py 
-4. In branch “bugfix” use the git mv command to rename fix_clean_tweets_dataframe.py to clean_tweets_dataframe.py and fix_extract_dataframe.py  to extract_dataframe.py 
-5. Fix the bugs on clean_tweets_dataframe.py and extract_dataframe.py 
-6. Multiple times, push the code you are working on to git, and once the fix is complete, merge the fix_bug branch to main branch
-7. Create a new branch called “testing” for updating the unit tests in the test/ folder to be applicable to the code you fixed. 
-    a. Build your unit and integration tests to run on small data (< 1 MB) that you copied from what is provided - avoid pushing large data to github
-    b. Think about the key elements (units can be functions, classes, or modules; multiple of them working together to accomplish a task requires integration testing) of the code base you are working on. Write the following
-      - Unit tests: for individual key functions and classes
-      - Integration tests: for the integration of multiple units working together
-8. After completing the unit and integration tests, merge  the “testing” branch with the main branch
-9. In all cases when you merge, make sure you first do Pull Request, review, then accept the merge.
-10. Use github actions in your repository such that when you git push new code (or merge a branch) to the main branch, the unit test in tests/*.py runs automatically. All tests should pass.
-
-
-After Completing this Challenge, you would have explore  
-
-- Unittesting
-- Modular Coding
-- Software Engineering Best Practices
-- Python Package Structure
-- Bug Fix (Debugging)
-
-Have Fun and Cheers
+# Twitter-Data-Analysis
+
+### Here are the bare minimum requirements for completing this task
+
+1. Use this template to create a repository called Twitter-Data-Analysis in your github account. See ["Creating a repository from a template."](https://docs.github.com/en/articles/creating-a-repository-from-a-template) for more information.
+2. [Download](https://drive.google.com/drive/folders/19G8dmehf9vU0u6VTKGV-yWsQOn3IvPsd) and extract the necessary data and put it in the data directory. The data should not be added to git tracking.
+3. Create a branch called “bugfix” to fix the bugs in the fix_clean_tweets_dataframe.py and fix_extract_dataframe.py 
+4. In branch “bugfix” use the git mv command to rename fix_clean_tweets_dataframe.py to clean_tweets_dataframe.py and fix_extract_dataframe.py  to extract_dataframe.py 
+5. Fix the bugs on clean_tweets_dataframe.py and extract_dataframe.py 
+6. Push the code you are working on to git multiple times, and once the fix is complete, merge the “bugfix” branch into the main branch
+7. Create a new branch called “testing” for updating the unit tests in the test/ folder to be applicable to the code you fixed. 
+    a. Build your unit and integration tests to run on small data (< 1 MB) that you copied from what is provided - avoid pushing large data to github
+    b. Think about the key elements (units can be functions, classes, or modules; multiple of them working together to accomplish a task requires integration testing) of the code base you are working on. Write the following
+      - Unit tests: for individual key functions and classes
+      - Integration tests: for the integration of multiple units working together
+8. After completing the unit and integration tests, merge  the “testing” branch with the main branch
+9. In all cases when you merge, make sure you first do Pull Request, review, then accept the merge.
+10. Use github actions in your repository such that when you git push new code (or merge a branch) to the main branch, the unit test in tests/*.py runs automatically. All tests should pass.
+
+
+After completing this challenge, you will have explored:
+
+- Unit testing
+- Modular Coding
+- Software Engineering Best Practices
+- Python Package Structure
+- Bug Fix (Debugging)
+
+Have Fun and Cheers
diff --git a/clean_tweets_dataframe.py b/clean_tweets_dataframe.py
@@ -0,0 +1,143 @@
+import re
+import pandas as pd
+from defaults import *
+
+class Clean_Tweets:
+    """
+    The PEP8 Standard AMAZING!!!
+    """
+    def __init__(self, df:pd.DataFrame):
+        self.df = df
+        print('Automation in Action...!!!')
+
+    def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame:
+        """
+        remove rows that has column names. This error originated from
+        the data collection stage.  
+        """
+        unwanted_rows = self.df[self.df['retweet_count'] == 'retweet_count' ].index
+        self.df.drop(unwanted_rows , inplace=True)
+        self.df = self.df[self.df['polarity'] != 'polarity']
+        return df
+
+    def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame:
+        """
+        drop duplicate rows
+        """
+        self.df.drop_duplicates(subset='original_text', inplace=True)
+        return df
+
+    def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame:
+        """
+        convert column to datetime
+        """
+        self.df['created_at'] = pd.to_datetime(self.df['created_at'], errors='coerce')
+        return df
+
+    def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame:
+        """
+        convert columns like polarity, subjectivity, retweet_count
+        favorite_count etc to numbers
+        """
+        self.df['id'] = pd.to_numeric(self.df['id'], errors='coerce')
+        self.df['subjectivity'] = pd.to_numeric(self.df['subjectivity'],
+                                                errors='coerce')
+        self.df['listed_count'] = pd.to_numeric(self.df['listed_count'],
+                                                errors='coerce')
+        self.df['retweet_count'] = pd.to_numeric(self.df['retweet_count'],
+                                                 errors='coerce')
+        self.df['friends_count'] = pd.to_numeric(self.df['friends_count'],
+                                                 errors='coerce')
+        self.df['favorite_count'] = pd.to_numeric(self.df['favorite_count'],
+                                                  errors='coerce')
+        self.df['statuses_count'] = pd.to_numeric(self.df['statuses_count'],
+                                                  errors='coerce')
+        self.df['followers_count'] = pd.to_numeric(self.df['followers_count'],
+                                                   errors='coerce')
+        self.df['polarity'] = pd.to_numeric(self.df['polarity'],
+                                            errors='coerce')
+        return df
+
+    def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame:
+        """
+        remove non english tweets from lang
+        """
+        self.df.query("lang == 'en'", inplace=True)
+        return df
+
+    def drop_nulls(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        drop nulls
+        """
+        self.df = self.df.dropna(axis=0, how='any', inplace=False)
+        return df
+
+    def find_hashtags(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Method to find hashtags from tweets
+        This function will extract hashtags
+        """
+        self.df = re.findall('(#[A-Za-z]+[A-Za-z0-9-_]+)', df) 
+        return df
+
+    def text_category(self, series: pd.Series) -> list:
+        """
+        function that return positive, negative or neutral based on polarity
+        """
+        polarities = []
+        for pol in series:
+            if pol >= 0.00000000001:
+                polarities.append("positive")
+            elif pol == 0.00000000000:
+                polarities.append("neutral")
+            elif pol <= -0.00000000001:
+                polarities.append("negative")
+            else:
+                polarities.append('UNK')
+        return polarities
+
+    def fill_missing(self, df: pd.DataFrame, column: str, value):
+        """
+        fill null values of a specific column with the provided value
+        """
+
+        df[column] = df[column].fillna(value)
+
+        return df
+
+    def replace_empty_string(self, df:pd.DataFrame, column: str, value: str):
+        """
+        replace empty strings in a specific column with the provided value
+        """
+
+        df[column] = df[column].apply(lambda x: value if x == "" else x)
+
+        return df
+
+    def remove_characters(self, df: pd.DataFrame, column: str):
+        """
+        removes non-alphanumeric characters with the exception of underscore hyphen and space
+        from the specified column
+        """
+
+        df[column] = df[column].apply(lambda text: re.sub("[^a-zA-Z0-9\s_-]", "", text))
+
+        return df
+
+    def extract_device_name(self, source: str):
+        """
+        returns device name from source text
+        """
+        res = re.split('<|>', source)[2].strip()
+        return 
+
+if __name__ == "__main__":
+    """
+    read the twitter dataset and Pass the data to the Clean_Tweets
+    class
+    """
+    global_tweet_df = pd.read_json(global_data, lines=True)
+    global_cleaner = Clean_Tweets(global_tweet_df)
+
+    african_tweet_df = pd.read_json(african_data, lines=True)
+    african_cleaner = Clean_Tweets(african_tweet_df)
diff --git a/defaults.py b/defaults.py
@@ -0,0 +1,16 @@
+"""
+A script to store all default paths and strings.
+"""
+
+# All paths below are relative, so they resolve against the working
+# directory of the process that uses them.
+
+# Raw global tweet data set; clean_tweets_dataframe.py reads it with
+# pd.read_json(..., lines=True) when run as a script.
+global_data = 'data/global_twitter_data.json'
+
+# Processed global data set.
+# NOTE(review): not referenced in the code visible here; presumably the
+# destination for the cleaned global tweets - confirm against callers.
+processed_global_data = 'data/processed_global_tweet_data.json'
+
+
+# Raw African tweet data set; clean_tweets_dataframe.py reads it with
+# pd.read_json(..., lines=True) when run as a script.
+african_data = 'data/africa_twitter_data.json'
+
+# Processed African data set.
+# NOTE(review): not referenced in the code visible here; presumably the
+# destination for the cleaned African tweets - confirm against callers.
+processed_african_data = 'data/processed_africa_tweet_data.json'