Skip to content

Commit

Permalink
Export data to Commons (#65)
Browse files Browse the repository at this point in the history
  • Loading branch information
albertoleoncio authored Mar 5, 2025
1 parent 3b66bd0 commit 1008d94
Show file tree
Hide file tree
Showing 12 changed files with 599 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/django.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ jobs:
echo "SOCIAL_AUTH_MEDIAWIKI_KEY = 'key'" >> CapX/settings_local.py
echo "SOCIAL_AUTH_MEDIAWIKI_URL = 'https://localhost:8000/w/api.php'" >> CapX/settings_local.py
echo "SOCIAL_AUTH_MEDIAWIKI_SECRET = 'text'" >> CapX/settings_local.py
echo "CAPX_BOT_USERNAME = 'CapXBot'" >> CapX/settings_local.py
echo "CAPX_BOT_PASSWORD = 'password'" >> CapX/settings_local.py
echo "EMAIL_HOST = 'localhost'" >> CapX/settings_local.py
echo "EMAIL_PORT = 25" >> CapX/settings_local.py
echo "SERVER_EMAIL = 'root@localhost'" >> CapX/settings_local.py
Expand Down
3 changes: 2 additions & 1 deletion CapX/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from datetime import timedelta
from CapX.settings_local import (DEBUG, ALLOWED_HOSTS, SOCIAL_AUTH_MEDIAWIKI_CALLBACK,
DATABASES, BASE_DIR, SECRET_KEY, SOCIAL_AUTH_MEDIAWIKI_URL, SOCIAL_AUTH_MEDIAWIKI_KEY,
SOCIAL_AUTH_MEDIAWIKI_SECRET, LANGUAGES, EMAIL_HOST, EMAIL_PORT, SERVER_EMAIL, ADMINS)
SOCIAL_AUTH_MEDIAWIKI_SECRET, LANGUAGES, EMAIL_HOST, EMAIL_PORT, SERVER_EMAIL, ADMINS,
CAPX_BOT_USERNAME, CAPX_BOT_PASSWORD)


# Application definition
Expand Down
2 changes: 2 additions & 0 deletions CapX/settings_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
SOCIAL_AUTH_MEDIAWIKI_URL = 'https://meta.wikimedia.org/w/index.php'
SOCIAL_AUTH_MEDIAWIKI_KEY = os.environ.get("SOCIAL_AUTH_MEDIAWIKI_KEY")
SOCIAL_AUTH_MEDIAWIKI_SECRET = os.environ.get("SOCIAL_AUTH_MEDIAWIKI_SECRET")
CAPX_BOT_USERNAME = os.environ.get("CAPX_BOT_USERNAME")
CAPX_BOT_PASSWORD = os.environ.get("CAPX_BOT_PASSWORD")
LANGUAGES = (
('en', 'English'),
('pt-br', 'Brazilian Portuguese'),
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ This will create a SQLite file in your source directory.
SOCIAL_AUTH_MEDIAWIKI_KEY="<YOUR MEDIAWIKI OAUTH KEY>"
SOCIAL_AUTH_MEDIAWIKI_SECRET="<YOUR MEDIAWIKI OAUTH SECRET>"
SECRET_KEY="<CREATE YOUR OWN RANDOM KEY>"
CAPX_BOT_USERNAME="<YOUR BOT USERNAME>"
CAPX_BOT_PASSWORD="<YOUR BOT PASSWORD>"
```

To create a new OAuth app and key, go to https://meta.wikimedia.org/wiki/Special:OAuthConsumerRegistration. For local development, you can enable "This consumer is for use only by <your username>" and "User identity verification only".
Expand Down
Empty file added users/management/__init__.py
Empty file.
Empty file.
277 changes: 277 additions & 0 deletions users/management/commands/export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
from django.core.management.base import BaseCommand
from users.serializers import ProfileSerializer
from users.models import Profile, DataHash
from skills.models import Skill
from django.conf import settings
import json
import requests
import hashlib
import os

class Command(BaseCommand):
    """Export CapX users and capacities to Wikimedia Commons as tabular JSON.

    The command builds two tabular-data documents (users and capacities),
    hashes each of them, compares the hashes with the ones stored in
    ``DataHash`` and edits the matching Commons page only when the data
    changed. A new hash is stored only after Commons confirmed the edit, so
    a failed edit is retried on the next run.
    """

    help = "Export data to Commons in JSON tabular format"

    # Default so the helper methods can be used without going through handle().
    verbosity = 1

    # Upper bound, in seconds, for every outgoing HTTP request; without it
    # ``requests`` waits indefinitely on an unresponsive server.
    request_timeout = 30

    # Number of leading characters ("User:") dropped from Meta page titles.
    user_prefix_length = len('User:')

    def format_list(self, data_list):
        """Return ``data_list`` rendered as a ``[a, b, c]`` string."""
        return '[' + ', '.join(str(item) for item in data_list) + ']'

    def get_meta_wiki_users(self):
        """Return the user names whose Meta user pages transclude the template.

        The API ``continue`` values are followed so that entries beyond the
        first batch are not silently dropped.

        Raises:
            requests.exceptions.RequestException: on network or HTTP errors.
            KeyError: if a response carries no ``query.pages`` section (for
                example an API error), so a broken response never results in
                an empty user list being exported.
        """
        query_params = {
            'action': 'query',
            'prop': 'transcludedin',
            'pageids': '12993801',  # Template:CapacityExchange
            'tilimit': 'max',
            'tiprop': 'title',
            'tinamespace': '2',
            'format': 'json',
            'formatversion': '2',
        }
        usernames = []
        params = dict(query_params)
        while True:
            response = requests.get(
                'https://meta.wikimedia.org/w/api.php',
                params=params,
                timeout=self.request_timeout,
            )
            response.raise_for_status()
            payload = response.json()
            if self.verbosity >= 2:
                self.stdout.write(f"Meta wiki users response: {payload}")

            for page in payload['query']['pages']:
                # The property is absent when a batch holds no entries for the page.
                for entry in page.get('transcludedin', []):
                    usernames.append(entry['title'][self.user_prefix_length:])

            if 'continue' not in payload:
                break
            # Re-issue the original query merged with the continuation values.
            params = {**query_params, **payload['continue']}
        return usernames

    def process_profiles(self, profiles, meta_wiki_users):
        """Build the user rows and collect the skills they reference.

        Args:
            profiles: serialized profiles, each with ``user.username``,
                ``skills_known``, ``skills_available``, ``skills_wanted`` and
                optionally ``wiki_alt``.
            meta_wiki_users: user names found on Meta.

        Returns:
            A ``(rows, skills)`` tuple: ``rows`` is a list of
            ``[username, known, available, wanted]`` lists and ``skills`` is
            the de-duplicated list of referenced skills.
        """
        known_users = set(meta_wiki_users)  # constant-time membership tests
        formatted_data = []
        skills = []
        processed_usernames = set()
        skill_fields = ('skills_known', 'skills_available', 'skills_wanted')

        def add_row(name, profile):
            formatted_data.append(
                [name] + [self.format_list(profile[field]) for field in skill_fields]
            )
            processed_usernames.add(name)
            for field in skill_fields:
                skills.extend(profile[field])

        # First pass - process regular usernames
        for profile in profiles:
            username = profile['user']['username']
            if username in known_users:
                add_row(username, profile)

        # Second pass - process alternative usernames if not already processed
        for profile in profiles:
            alt_username = profile.get('wiki_alt')
            if (
                alt_username
                and alt_username in known_users
                and alt_username not in processed_usernames
            ):
                add_row(alt_username, profile)

        if self.verbosity >= 2:
            self.stdout.write(f"Processed profiles: {formatted_data}")
            self.stdout.write(f"Skills: {skills}")

        return formatted_data, list(set(skills))

    def get_skill_dict(self, skills):
        """Map each skill's ``skill_wikidata_item`` to the given skill id.

        All skills are loaded with a single query. Skills without an item
        identifier are left out because they cannot be looked up remotely.

        Raises:
            KeyError: if one of ``skills`` does not exist in the database.
        """
        skills = list(skills)
        skills_by_id = Skill.objects.in_bulk(skills)
        skill_dict = {}
        for skill in skills:
            quid = skills_by_id[skill].skill_wikidata_item
            if quid:
                skill_dict[quid] = skill
        if self.verbosity >= 2:
            self.stdout.write(f"Skill dictionary: {skill_dict}")
        return skill_dict

    def get_sparql_query(self, quids):
        """Return the SPARQL query selecting label and description per item id."""
        query = """
        PREFIX wbt: <https://metabase.wikibase.cloud/prop/direct/>
        SELECT ?item ?itemLabel ?itemDescription ?value WHERE {
        VALUES ?value { %s }
        ?item wbt:P1 ?value.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
        }
        """
        # Escape backslashes and quotes so a value cannot end its string literal.
        literals = ' '.join(
            '"{}"'.format(str(quid).replace('\\', '\\\\').replace('"', '\\"'))
            for quid in quids
        )
        sparql_query = query % literals
        if self.verbosity >= 2:
            self.stdout.write(f"SPARQL query: {sparql_query}")
        return sparql_query

    def process_sparql_response(self, response, skill_dict):
        """Turn the SPARQL bindings into ``[skill, label, description]`` rows.

        Label and description default to an empty string when the binding
        does not include them.
        """
        formatted_data = [
            [
                skill_dict[item['value']['value']],
                item['itemLabel']['value'] if 'itemLabel' in item else '',
                item['itemDescription']['value'] if 'itemDescription' in item else '',
            ]
            for item in response.json()['results']['bindings']
        ]
        if self.verbosity >= 2:
            self.stdout.write(f"SPARQL response data: {formatted_data}")
        return formatted_data

    def create_output_users(self, formatted_data):
        """Wrap the user rows in the tabular-data document for Commons."""
        output_users = {
            "license": "CC0-1.0",
            "description": {"en": "Users enrolled in the CapX platform"},
            "sources": "https://capx.toolforge.org",
            "schema": {
                "fields": [
                    {"name": "username", "type": "string"},
                    {"name": "skills_known", "type": "string"},
                    {"name": "skills_available", "type": "string"},
                    {"name": "skills_wanted", "type": "string"}
                ],
            },
            "data": formatted_data,
        }
        if self.verbosity >= 2:
            self.stdout.write(f"Output users: {output_users}")
        return output_users

    def create_output_capacities(self, formatted_data):
        """Wrap the capacity rows in the tabular-data document for Commons."""
        output_capacities = {
            "license": "CC0-1.0",
            "description": {"en": "Capacities added in the CapX platform"},
            "sources": "https://capx.toolforge.org",
            "schema": {
                "fields": [
                    {"name": "id", "type": "number"},
                    {"name": "name", "type": "string"},
                    {"name": "description", "type": "string"}
                ],
            },
            "data": formatted_data,
        }
        if self.verbosity >= 2:
            self.stdout.write(f"Output capacities: {output_capacities}")
        return output_capacities

    def get_login_token(self, session, url):
        """Fetch a login token for ``session`` from the API at ``url``."""
        params = {
            "action": "query",
            "meta": "tokens",
            "type": "login",
            "format": "json"
        }
        response = session.get(url=url, params=params, timeout=self.request_timeout)
        response.raise_for_status()
        data = response.json()
        if self.verbosity >= 2:
            self.stdout.write(f"Login token response: {data}")
        return data['query']['tokens']['logintoken']

    def login(self, session, url, login_token):
        """Log the bot account in and return the decoded API response.

        Raises:
            requests.exceptions.RequestException: on HTTP errors or when the
                API does not report a successful login.
        """
        params = {
            "action": "login",
            "lgname": settings.CAPX_BOT_USERNAME,
            "lgpassword": settings.CAPX_BOT_PASSWORD,
            "lgtoken": login_token,
            "format": "json"
        }
        http_response = session.post(url, data=params, timeout=self.request_timeout)
        http_response.raise_for_status()
        response = http_response.json()
        if response['login']['result'] != 'Success':
            raise requests.exceptions.RequestException("Login failed")
        if self.verbosity >= 2:
            self.stdout.write(f"Login response: {response}")
        return response

    def get_csrf_token(self, session, url):
        """Fetch a CSRF (edit) token for the logged-in ``session``."""
        params = {
            "action": "query",
            "meta": "tokens",
            "format": "json"
        }
        response = session.get(url=url, params=params, timeout=self.request_timeout)
        response.raise_for_status()
        data = response.json()
        if self.verbosity >= 2:
            self.stdout.write(f"CSRF token response: {data}")
        return data['query']['tokens']['csrftoken']

    def edit_page(self, session, url, title, summary, text, csrf_token):
        """Replace the content of page ``title`` and return the API response.

        The decoded JSON is returned as is; the caller decides whether the
        API reported success.

        Raises:
            requests.exceptions.RequestException: on network or HTTP errors.
        """
        params = {
            "action": "edit",
            "title": title,
            "summary": summary,
            "text": text,
            "token": csrf_token,
            "minor": "1",
            "format": "json"
        }
        if self.verbosity >= 2:
            self.stdout.write(f"Editing page {title} with text: {text}")

        response = session.post(url, data=params, timeout=self.request_timeout)
        response.raise_for_status()
        return response.json()

    def hash_data(self, data):
        """Return a hex digest of ``data`` that fits ``DataHash.hash_value``.

        The SHA-256 hex digest (64 characters) is cut to the declared length
        of the ``hash_value`` column. Storing the full digest in a shorter
        column either fails or is truncated by the database, and a truncated
        stored value would never equal the freshly computed one.
        """
        digest = hashlib.sha256(json.dumps(data, sort_keys=True).encode('utf-8')).hexdigest()
        max_length = DataHash._meta.get_field('hash_value').max_length
        hash_value = digest[:max_length] if max_length else digest
        if self.verbosity >= 2:
            self.stdout.write(f"Hashed data: {hash_value}")
        return hash_value

    def get_previous_hash(self, data_type):
        """Return the stored hash for ``data_type`` or ``None`` if there is none."""
        try:
            hash_value = DataHash.objects.get(data_type=data_type).hash_value
        except DataHash.DoesNotExist:
            if self.verbosity >= 2:
                self.stdout.write(f"No previous hash found for {data_type}")
            return None
        if self.verbosity >= 2:
            self.stdout.write(f"Previous hash for {data_type}: {hash_value}")
        return hash_value

    def save_current_hash(self, data_type, hash_value):
        """Create or update the stored hash for ``data_type`` and return the row."""
        data_hash, _ = DataHash.objects.update_or_create(
            data_type=data_type,
            defaults={'hash_value': hash_value}
        )
        if self.verbosity >= 2:
            self.stdout.write(f"Saved current hash for {data_type}: {hash_value}")
        return data_hash

    def handle(self, *args, **options):
        """Build both data sets and publish the ones whose content changed.

        Raises:
            requests.exceptions.RequestException: on network or HTTP errors,
                on a failed login, or when Commons does not confirm an edit.
        """
        self.verbosity = options.get('verbosity', 1)
        profile_serializer = ProfileSerializer(Profile.objects.all(), many=True)
        meta_wiki_users = self.get_meta_wiki_users()
        formatted_data, skills = self.process_profiles(profile_serializer.data, meta_wiki_users)
        output_users = self.create_output_users(formatted_data)

        skill_dict = self.get_skill_dict(skills)
        quids = list(skill_dict.keys())
        sparql_query = self.get_sparql_query(quids)
        response = requests.get(
            'https://metabase.wikibase.cloud/query/sparql',
            params={'query': sparql_query, 'format': 'json'},
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        formatted_data = self.process_sparql_response(response, skill_dict)
        output_capacities = self.create_output_capacities(formatted_data)

        # Keep only the data sets whose current hash differs from the stored one.
        candidates = (
            ('users', "Data:CapacityExchange/users.tab", output_users),
            ('capacities', "Data:CapacityExchange/capacities.tab", output_capacities),
        )
        pending = []
        for data_type, title, output in candidates:
            current_hash = self.hash_data(output)
            if current_hash != self.get_previous_hash(data_type):
                pending.append((data_type, title, output, current_hash))

        if not pending:
            return

        url = "https://commons.wikimedia.org/w/api.php"
        with requests.Session() as session:
            login_token = self.get_login_token(session, url)
            self.login(session, url, login_token)

            for data_type, title, output, current_hash in pending:
                csrf_token = self.get_csrf_token(session, url)
                result = self.edit_page(
                    session, url, title, "Updating data",
                    json.dumps(output, indent=4), csrf_token
                )
                if self.verbosity >= 2:
                    self.stdout.write(f"Edit response for {title}: {result}")
                # Record the hash only once the edit is confirmed; otherwise
                # the failed export would be skipped on every later run.
                if result.get('edit', {}).get('result') != 'Success':
                    raise requests.exceptions.RequestException(
                        f"Editing {title} failed: {result}"
                    )
                self.save_current_hash(data_type, current_hash)
22 changes: 22 additions & 0 deletions users/migrations/0017_datahash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 4.2.11 on 2025-03-04 18:39

from django.db import migrations, models


# Schema migration for the users app: creates the table backing the DataHash model.
class Migration(migrations.Migration):

# Declares the users migration this one depends on.
dependencies = [
('users', '0016_alter_profile_avatar'),
]

operations = [
migrations.CreateModel(
name='DataHash',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
# unique=True: at most one row per data_type value.
('data_type', models.CharField(max_length=50, unique=True)),
# NOTE(review): a full SHA-256 hex digest is 64 characters and does not fit in 32 — confirm the length of the digest stored here.
('hash_value', models.CharField(max_length=32)),
# auto_now=True: set to the current time on every save.
('updated_at', models.DateTimeField(auto_now=True)),
],
),
]
2 changes: 1 addition & 1 deletion users/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from django.db.models.signals import post_save
from orgs.models import Organization
from skills.models import Skill
from users.submodels import Territory, Language, WikimediaProject, Avatar
from users.submodels import Territory, Language, WikimediaProject, Avatar, DataHash
from django.core.validators import RegexValidator


Expand Down
10 changes: 9 additions & 1 deletion users/submodels.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,12 @@ class AuthExtraInfo(models.Model):
created_at = models.DateTimeField(
verbose_name="Created at",
auto_now_add=True
)
)

class DataHash(models.Model):
    """Stored hash value for one named data set, with its last update time."""

    # Name of the data set; unique, so there is at most one row per name.
    data_type = models.CharField(max_length=50, unique=True)
    # NOTE(review): a full SHA-256 hex digest is 64 characters, so only a
    # digest of at most 32 characters fits here — confirm the writer's length.
    hash_value = models.CharField(max_length=32)
    # Set to the current time on every save (auto_now=True).
    updated_at = models.DateTimeField(auto_now=True)

    def __str__(self):
        """Return ``"<data_type>: <hash_value>"``."""
        return "{}: {}".format(self.data_type, self.hash_value)
Loading

0 comments on commit 1008d94

Please sign in to comment.