Export data to Commons (#65)

WikiMovimentoBrasil · Mar 5, 2025 · 1008d94 · 1008d94
1 parent 3b66bd0
commit 1008d94
Show file tree

Hide file tree

Showing 12 changed files with 599 additions and 4 deletions.
diff --git a/.github/workflows/django.yml b/.github/workflows/django.yml
@@ -37,6 +37,8 @@ jobs:
         echo "SOCIAL_AUTH_MEDIAWIKI_KEY = 'key'" >> CapX/settings_local.py
         echo "SOCIAL_AUTH_MEDIAWIKI_URL = 'https://localhost:8000/w/api.php'" >> CapX/settings_local.py
         echo "SOCIAL_AUTH_MEDIAWIKI_SECRET = 'text'" >> CapX/settings_local.py
+        echo "CAPX_BOT_USERNAME = 'CapXBot'" >> CapX/settings_local.py
+        echo "CAPX_BOT_PASSWORD = 'password'" >> CapX/settings_local.py
         echo "EMAIL_HOST = 'localhost'" >> CapX/settings_local.py
         echo "EMAIL_PORT = 25" >> CapX/settings_local.py
         echo "SERVER_EMAIL = 'root@localhost'" >> CapX/settings_local.py

diff --git a/CapX/settings.py b/CapX/settings.py
@@ -14,7 +14,8 @@
 from datetime import timedelta
 from CapX.settings_local import (DEBUG, ALLOWED_HOSTS, SOCIAL_AUTH_MEDIAWIKI_CALLBACK, 
     DATABASES, BASE_DIR, SECRET_KEY, SOCIAL_AUTH_MEDIAWIKI_URL, SOCIAL_AUTH_MEDIAWIKI_KEY, 
-    SOCIAL_AUTH_MEDIAWIKI_SECRET, LANGUAGES, EMAIL_HOST, EMAIL_PORT, SERVER_EMAIL, ADMINS)
+    SOCIAL_AUTH_MEDIAWIKI_SECRET, LANGUAGES, EMAIL_HOST, EMAIL_PORT, SERVER_EMAIL, ADMINS,
+    CAPX_BOT_USERNAME, CAPX_BOT_PASSWORD)
 
 
 # Application definition

diff --git a/CapX/settings_local.py b/CapX/settings_local.py
@@ -10,6 +10,8 @@
 SOCIAL_AUTH_MEDIAWIKI_URL = 'https://meta.wikimedia.org/w/index.php'
 SOCIAL_AUTH_MEDIAWIKI_KEY = os.environ.get("SOCIAL_AUTH_MEDIAWIKI_KEY")
 SOCIAL_AUTH_MEDIAWIKI_SECRET = os.environ.get("SOCIAL_AUTH_MEDIAWIKI_SECRET")
+CAPX_BOT_USERNAME = os.environ.get("CAPX_BOT_USERNAME")
+CAPX_BOT_PASSWORD = os.environ.get("CAPX_BOT_PASSWORD")
 LANGUAGES = (
     ('en', 'English'),
     ('pt-br', 'Brazilian Portuguese'),

diff --git a/README.md b/README.md
@@ -82,6 +82,8 @@ This will create a SQLite file in your source directory.
    SOCIAL_AUTH_MEDIAWIKI_KEY="<YOUR MEDIAWIKI OAUTH KEY>"
    SOCIAL_AUTH_MEDIAWIKI_SECRET="<YOUR MEDIAWIKI OAUTH SECRET>"
    SECRET_KEY="<CREATE YOUR OWN RANDOM KEY>"
+   CAPX_BOT_USERNAME="<YOUR BOT USERNAME>"
+   CAPX_BOT_PASSWORD="<YOUR BOT PASSWORD>"
    ```
 
 To create a new OAuth app and key, go to https://meta.wikimedia.org/wiki/Special:OAuthConsumerRegistration . For local development, you can enable "This consumer is for use only by <your username>" and "User identity verification only".

diff --git a/users/management/__init__.py b/users/management/__init__.py
diff --git a/users/management/commands/__init__.py b/users/management/commands/__init__.py
diff --git a/users/management/commands/export.py b/users/management/commands/export.py
@@ -0,0 +1,277 @@
+from django.core.management.base import BaseCommand
+from users.serializers import ProfileSerializer
+from users.models import Profile, DataHash
+from skills.models import Skill
+from django.conf import settings
+import json
+import requests
+import hashlib
+import os
+
+class Command(BaseCommand):
+    help = "Export data to Commons in JSON tabular format"
+
+    def format_list(self, data_list):
+        return '[' + ', '.join(str(item) for item in data_list) + ']'
+
+    def get_meta_wiki_users(self):
+        query_params = {
+            'action': 'query',
+            'prop': 'transcludedin',
+            'pageids': '12993801',  # Template:CapacityExchange
+            'tilimit': 'max',
+            'tiprop': 'title',
+            'tinamespace': '2',
+            'format': 'json',
+            'formatversion': '2',
+        }
+        response = requests.get('https://meta.wikimedia.org/w/api.php', params=query_params)
+        if self.verbosity >= 2:
+            self.stdout.write(f"Meta wiki users response: {response.json()}")
+        return [page['title'][5:] for page in response.json()['query']['pages'][0]['transcludedin']]
+
+    def process_profiles(self, profiles, meta_wiki_users):
+        formatted_data = []
+        skills = []
+        processed_usernames = set()
+
+        # First pass - process regular usernames
+        for profile in profiles:
+            username = profile['user']['username']
+            if username in meta_wiki_users:
+                data = [
+                    username,
+                    self.format_list(profile['skills_known']),
+                    self.format_list(profile['skills_available']),
+                    self.format_list(profile['skills_wanted'])
+                ]
+                formatted_data.append(data)
+                processed_usernames.add(username)
+
+                skills.extend(profile['skills_known'])
+                skills.extend(profile['skills_available'])
+                skills.extend(profile['skills_wanted'])
+
+        # Second pass - process alternative usernames if not already processed
+        for profile in profiles:
+            if 'wiki_alt' in profile and profile['wiki_alt'] and profile['wiki_alt'] in meta_wiki_users:
+                alt_username = profile['wiki_alt']
+                if alt_username not in processed_usernames:
+                    data = [
+                        alt_username,
+                        self.format_list(profile['skills_known']),
+                        self.format_list(profile['skills_available']),
+                        self.format_list(profile['skills_wanted'])
+                    ]
+                    formatted_data.append(data)
+                    processed_usernames.add(alt_username)
+
+                    skills.extend(profile['skills_known'])
+                    skills.extend(profile['skills_available'])
+                    skills.extend(profile['skills_wanted'])
+
+        if self.verbosity >= 2:
+            self.stdout.write(f"Processed profiles: {formatted_data}")
+            self.stdout.write(f"Skills: {skills}")
+
+        return formatted_data, list(set(skills))
+
+    def get_skill_dict(self, skills):
+        skill_dict = {Skill.objects.get(id=skill).skill_wikidata_item: skill for skill in skills}
+        if self.verbosity >= 2:
+            self.stdout.write(f"Skill dictionary: {skill_dict}")
+        return skill_dict
+
+    def get_sparql_query(self, quids):
+        query = """
+        PREFIX wbt: <https://metabase.wikibase.cloud/prop/direct/>
+        SELECT ?item ?itemLabel ?itemDescription ?value WHERE {
+            VALUES ?value { %s }
+            ?item wbt:P1 ?value.
+        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+        }
+        """
+        sparql_query = query % ' '.join([f'"{quid}"' for quid in quids])
+        if self.verbosity >= 2:
+            self.stdout.write(f"SPARQL query: {sparql_query}")
+        return sparql_query
+
+    def process_sparql_response(self, response, skill_dict):
+        formatted_data = []
+        for item in response.json()['results']['bindings']:
+            data = [
+                skill_dict[item['value']['value']],
+                item['itemLabel']['value'] if 'itemLabel' in item else '',
+                item['itemDescription']['value'] if 'itemDescription' in item else ''
+            ]
+            formatted_data.append(data)
+        if self.verbosity >= 2:
+            self.stdout.write(f"SPARQL response data: {formatted_data}")
+        return formatted_data
+
+    def create_output_users(self, formatted_data):
+        output_users = {
+            "license": "CC0-1.0",
+            "description": {"en": "Users enrolled in the CapX platform"},
+            "sources": "https://capx.toolforge.org",
+            "schema": {
+                "fields": [
+                    {"name": "username", "type": "string"},
+                    {"name": "skills_known", "type": "string"},
+                    {"name": "skills_available", "type": "string"},
+                    {"name": "skills_wanted", "type": "string"}
+                ],
+            },
+            "data": formatted_data,
+        }
+        if self.verbosity >= 2:
+            self.stdout.write(f"Output users: {output_users}")
+        return output_users
+
+    def create_output_capacities(self, formatted_data):
+        output_capacities = {
+            "license": "CC0-1.0",
+            "description": {"en": "Capacities added in the CapX platform"},
+            "sources": "https://capx.toolforge.org",
+            "schema": {
+                "fields": [
+                    {"name": "id", "type": "number"},
+                    {"name": "name", "type": "string"},
+                    {"name": "description", "type": "string"}
+                ],
+            },
+            "data": formatted_data,
+        }
+        if self.verbosity >= 2:
+            self.stdout.write(f"Output capacities: {output_capacities}")
+        return output_capacities
+
+    def get_login_token(self, session, url):
+        params = {
+            "action": "query",
+            "meta": "tokens",
+            "type": "login",
+            "format": "json"
+        }
+        response = session.get(url=url, params=params)
+        data = response.json()
+        if self.verbosity >= 2:
+            self.stdout.write(f"Login token response: {data}")
+        return data['query']['tokens']['logintoken']
+
+    def login(self, session, url, login_token):
+        params = {
+            "action": "login",
+            "lgname": settings.CAPX_BOT_USERNAME,
+            "lgpassword": settings.CAPX_BOT_PASSWORD,
+            "lgtoken": login_token,
+            "format": "json"
+        }
+        response = session.post(url, data=params).json()
+        if response['login']['result'] != 'Success':
+            raise requests.exceptions.RequestException("Login failed")
+        if self.verbosity >= 2:
+            self.stdout.write(f"Login response: {response}")
+        return response
+
+    def get_csrf_token(self, session, url):
+        params = {
+            "action": "query",
+            "meta": "tokens",
+            "format": "json"
+        }
+        response = session.get(url=url, params=params)
+        data = response.json()
+        if self.verbosity >= 2:
+            self.stdout.write(f"CSRF token response: {data}")
+        return data['query']['tokens']['csrftoken']
+
+    def edit_page(self, session, url, title, summary, text, csrf_token):
+        params = {
+            "action": "edit",
+            "title": title,
+            "summary": summary,
+            "text": text,
+            "token": csrf_token,
+            "minor": "1",
+            "format": "json"
+        }
+        if self.verbosity >= 2:
+            self.stdout.write(f"Editing page {title} with text: {text}")
+
+        response = session.post(url, data=params)
+        return response.json()
+
+    def hash_data(self, data):
+        hash_value = hashlib.sha256(json.dumps(data, sort_keys=True).encode('utf-8')).hexdigest()
+        if self.verbosity >= 2:
+            self.stdout.write(f"Hashed data: {hash_value}")
+        return hash_value
+
+    def get_previous_hash(self, data_type):
+        try:
+            hash_value = DataHash.objects.get(data_type=data_type).hash_value
+            if self.verbosity >= 2:
+                self.stdout.write(f"Previous hash for {data_type}: {hash_value}")
+            return hash_value
+        except DataHash.DoesNotExist:
+            if self.verbosity >= 2:
+                self.stdout.write(f"No previous hash found for {data_type}")
+            return None
+
+    def save_current_hash(self, data_type, hash_value):
+        data_hash, _ = DataHash.objects.update_or_create(
+            data_type=data_type,
+            defaults={'hash_value': hash_value}
+        )
+        if self.verbosity >= 2:
+            self.stdout.write(f"Saved current hash for {data_type}: {hash_value}")
+        return data_hash
+
+    def handle(self, *args, **options):
+        self.verbosity = options.get('verbosity', 1)
+        profile_serializer = ProfileSerializer(Profile.objects.all(), many=True)
+        meta_wiki_users = self.get_meta_wiki_users()
+        formatted_data, skills = self.process_profiles(profile_serializer.data, meta_wiki_users)
+        output_users = self.create_output_users(formatted_data)
+
+        skill_dict = self.get_skill_dict(skills)
+        quids = list(skill_dict.keys())
+        sparql_query = self.get_sparql_query(quids)
+        response = requests.get(
+            'https://metabase.wikibase.cloud/query/sparql',
+            params={'query': sparql_query, 'format': 'json'}
+        )
+        formatted_data = self.process_sparql_response(response, skill_dict)
+        output_capacities = self.create_output_capacities(formatted_data)
+
+        # Hash current data
+        current_users_hash = self.hash_data(output_users)
+        current_capacities_hash = self.hash_data(output_capacities)
+
+        # Get previous hashes from the database
+        previous_users_hash = self.get_previous_hash('users')
+        previous_capacities_hash = self.get_previous_hash('capacities')
+
+        # Check if data has changed
+        if current_users_hash != previous_users_hash or current_capacities_hash != previous_capacities_hash:
+            session = requests.Session()
+            url = "https://commons.wikimedia.org/w/api.php"
+            login_token = self.get_login_token(session, url)
+            self.login(session, url, login_token)
+
+            if current_users_hash != previous_users_hash:
+                csrf_token = self.get_csrf_token(session, url)
+                self.edit_page(
+                    session, url, "Data:CapacityExchange/users.tab", "Updating data",
+                    json.dumps(output_users, indent=4), csrf_token
+                )
+                self.save_current_hash('users', current_users_hash)
+
+            if current_capacities_hash != previous_capacities_hash:
+                csrf_token = self.get_csrf_token(session, url)
+                self.edit_page(
+                    session, url, "Data:CapacityExchange/capacities.tab", "Updating data",
+                    json.dumps(output_capacities, indent=4), csrf_token
+                )
+                self.save_current_hash('capacities', current_capacities_hash)
diff --git a/users/migrations/0017_datahash.py b/users/migrations/0017_datahash.py
@@ -0,0 +1,22 @@
+# Generated by Django 4.2.11 on 2025-03-04 18:39
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Create the ``DataHash`` model in the ``users`` app."""
+
+    # Must run after the previous migration of the ``users`` app.
+    dependencies = [
+        ('users', '0016_alter_profile_avatar'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='DataHash',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                # One row per data type (unique), at most 50 characters.
+                ('data_type', models.CharField(max_length=50, unique=True)),
+                # Hash string, at most 32 characters.
+                ('hash_value', models.CharField(max_length=32)),
+                # Set automatically each time the row is saved.
+                ('updated_at', models.DateTimeField(auto_now=True)),
+            ],
+        ),
+    ]
diff --git a/users/models.py b/users/models.py
@@ -5,7 +5,7 @@
 from django.db.models.signals import post_save
 from orgs.models import Organization
 from skills.models import Skill
-from users.submodels import Territory, Language, WikimediaProject, Avatar
+from users.submodels import Territory, Language, WikimediaProject, Avatar, DataHash
 from django.core.validators import RegexValidator
 
 

diff --git a/users/submodels.py b/users/submodels.py
@@ -88,4 +88,12 @@ class AuthExtraInfo(models.Model):
     created_at = models.DateTimeField(
         verbose_name="Created at",
         auto_now_add=True
-    )
+    )
+
+class DataHash(models.Model):
+    data_type = models.CharField(max_length=50, unique=True)
+    hash_value = models.CharField(max_length=32)
+    updated_at = models.DateTimeField(auto_now=True)
+
+    def __str__(self):
+        return f"{self.data_type}: {self.hash_value}"