Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
hmhard committed Aug 11, 2024
0 parents commit ce4ca83
Show file tree
Hide file tree
Showing 22 changed files with 1,537,042 additions and 0 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application

# Run on every push to main and on every pull request that targets main.
on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

# The steps below only read the repository, so the token is kept read-only.
permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.10
      # v5 replaces v3, which runs on a Node.js runtime that GitHub Actions has deprecated.
      uses: actions/setup-python@v5
      with:
        python-version: "3.10"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    # The pytest step is currently disabled; uncomment it to run the tests.
    #- name: Test with pytest
    #  run: |
    #    pytest
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
files
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# [Tikvah](https://t.me/s/tikvahethiopia) Telegram channel analysis repo
> **Note:** The analysis in this repo is done for **learning purposes** only.
23 changes: 23 additions & 0 deletions clean_stop_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import json

data_file_path = 'cleared_unwanted_keys.json'
keys_to_remove_file_path = 'stop_words.json'
output_file_path = 'final_filtered_data.json'


def remove_stop_words(key_value_pairs, keys_to_remove):
    """Return a copy of *key_value_pairs* without the keys in *keys_to_remove*.

    Args:
        key_value_pairs: Mapping of word -> value, as loaded from the data file.
        keys_to_remove: Collection of keys to drop (a list, or a mapping whose
            keys are the words to drop).

    Returns:
        A new dict holding the surviving entries in their original order.
    """
    try:
        # Hash-based lookup: O(1) per key instead of scanning the whole list.
        stop_words = set(keys_to_remove)
    except TypeError:
        # Unhashable entries (e.g. nested lists) cannot live in a set; keep
        # the original container and its equality-based membership test.
        stop_words = keys_to_remove
    return {k: v for k, v in key_value_pairs.items() if k not in stop_words}


def main():
    """Load the data and stop-word files, filter the data, and save the result."""
    with open(data_file_path, 'r', encoding='utf-8') as file:
        key_value_pairs = json.load(file)

    with open(keys_to_remove_file_path, 'r', encoding='utf-8') as file:
        keys_to_remove = json.load(file)

    filtered_dict = remove_stop_words(key_value_pairs, keys_to_remove)

    # ensure_ascii=False keeps non-ASCII words readable in the output file.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(filtered_dict, file, ensure_ascii=False, indent=4)

    print(f"Filtered data saved to {output_file_path}")


# Running the file as a script behaves as before; importing it no longer
# touches the filesystem.
if __name__ == "__main__":
    main()
38 changes: 38 additions & 0 deletions clear-non-alpha.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import json
import re

file_path = 'word_dictionary.json'

valid_keys_file = 'valid_keys.json'
invalid_keys_file = 'invalid_keys.json'

symbols_pattern = re.compile(r'^[\W_]+$', re.UNICODE)  # Matches only non-alphanumeric characters (symbols)
numbers_pattern = re.compile(r'^\d+$')  # Matches only numbers
# Matches a single code point in U+10000..U+10FFFF. This is used as an emoji
# test, but it also matches every other character in those planes.
emoji_pattern = re.compile("[\U00010000-\U0010FFFF]", flags=re.UNICODE)


def is_only_emoji(s):
    """Return True when *s* is non-empty and every character matches emoji_pattern.

    An empty string returns False: all() over an empty iterable is True, which
    would otherwise report "" as consisting only of emoji.
    """
    return bool(s) and all(emoji_pattern.match(c) for c in s)


def split_keys(data):
    """Partition *data* into ``(valid_keys, invalid_keys)`` dicts.

    A key is invalid when it is empty, or consists only of symbols, only of
    digits, or only of characters matched by emoji_pattern. Every other key is
    valid. Values are carried over unchanged and insertion order is kept.
    """
    valid_keys = {}
    invalid_keys = {}
    for k, v in data.items():
        # Empty keys are rejected explicitly so they stay in the invalid
        # bucket now that is_only_emoji("") is False.
        if not k or symbols_pattern.match(k) or numbers_pattern.match(k) or is_only_emoji(k):
            invalid_keys[k] = v
        else:
            valid_keys[k] = v
    return valid_keys, invalid_keys


def main():
    """Read the word dictionary, split its keys, and write both result files."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    valid_keys, invalid_keys = split_keys(data)

    # Save valid keys to a JSON file
    with open(valid_keys_file, 'w', encoding='utf-8') as file:
        json.dump(valid_keys, file, ensure_ascii=False, indent=4)

    # Save invalid keys to a JSON file
    with open(invalid_keys_file, 'w', encoding='utf-8') as file:
        json.dump(invalid_keys, file, ensure_ascii=False, indent=4)

    print(f"Valid keys saved to {valid_keys_file}")
    print(f"Invalid keys saved to {invalid_keys_file}")


# Running the file as a script behaves as before; importing it no longer
# touches the filesystem.
if __name__ == "__main__":
    main()
Loading

0 comments on commit ce4ca83

Please sign in to comment.