#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot finds the English Wikipedia counterpart of a non-English Wikipedia
page and fetches its categories. If any of those categories has a counterpart
in the origin Wikipedia, the bot then adds the page to those categories.
Forked to create categories using [[Bikarhêner:Balyozxane/skrîpt/py/category creator.py]]
"""
#
# (C) User:Huji, 2021
# The original version can be found at https://github.com/PersianWikipedia/fawikibot/blob/master/categorize.py
# Distributed under the terms of the MIT license.
#
import pywikibot
from category_creator import CategoryCreator # Import the CategoryCreator class
from pywikibot import pagegenerators
from functools import lru_cache
from pywikibot.bot import (
SingleSiteBot,
ExistingPageBot,
NoRedirectPageBot,
AutomaticTWSummaryBot,
)
import re
import requests
import mwparserfromhell
from kucosmetics import CANCEL, CosmeticChangesToolkit
import urllib.parse
# Show help with the parameter -help.
# pywikibot substitutes the "&params;" placeholder in the module docstring
# with the generic page-generator help text. The key had been mangled into
# "¶ms;" (an "&para" entity decoded to a pilcrow), which can never match.
docuReplacements = {"&params;": pagegenerators.parameterHelp}
# Module-wide switch for the diagnostic print()/pywikibot.output() calls.
VERBOSE = False
def get_enwiki_title(kuwiki_title):
    """Return the English Wikipedia title linked to a Kurdish Wikipedia page.

    The Wikidata ``wbgetentities`` API is asked for the item whose ``kuwiki``
    sitelink is *kuwiki_title*; the item's ``enwiki`` sitelink is returned.

    Network failures, HTTP error statuses, undecodable responses and API
    error payloads are reported with ``pywikibot.warning`` and yield None,
    so a transient Wikidata problem skips the page instead of aborting the
    whole bot run.

    @param kuwiki_title: full page title on kuwiki (including namespace)
    @type kuwiki_title: str
    @return: the enwiki page title, or None when there is no enwiki
        sitelink or the lookup failed
    @rtype: str or None
    """
    # Let requests do the percent-encoding instead of hand-building the URL.
    params = {
        "action": "wbgetentities",
        "sites": "kuwiki",
        "titles": kuwiki_title,
        "props": "sitelinks",
        "format": "json",
    }
    try:
        # A timeout keeps a stalled connection from hanging the bot forever.
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params=params,
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        pywikibot.warning(
            f"Wikidata lookup for '{kuwiki_title}' failed: {error}"
        )
        return None

    # An API error payload has an "error" key and no "entities" key.
    entities = data.get("entities") if isinstance(data, dict) else None
    if not entities:
        pywikibot.warning(
            f"Wikidata returned no entities for '{kuwiki_title}'."
        )
        return None

    entity = next(iter(entities.values()))  # Get the first (and only) entity
    # Membership tests (rather than .get) are kept so the check also works
    # if "sitelinks" is not a mapping.
    if "sitelinks" in entity and "enwiki" in entity["sitelinks"]:
        return entity["sitelinks"]["enwiki"]["title"]
    return None
def get_kuwiki_title(enwiki_title):
    """Return the Kurdish Wikipedia title linked to an English Wikipedia page.

    The Wikidata ``wbgetentities`` API is asked for the item whose ``enwiki``
    sitelink is *enwiki_title*; the item's ``kuwiki`` sitelink is returned
    with a leading ``Kategorî:`` namespace prefix removed.

    Unlike a "not found" result, lookup failures are raised rather than
    turned into None: CategorizeBot.treat_page reacts to None by creating a
    new category, so a network hiccup must not look like a missing sitelink.

    @param enwiki_title: full page title on enwiki (including namespace)
    @type enwiki_title: str
    @return: the kuwiki title without the ``Kategorî:`` prefix, or None when
        the item has no kuwiki sitelink
    @rtype: str or None
    @raise requests.RequestException: on network errors, timeouts or HTTP
        error statuses
    @raise KeyError: if the API response contains no entities
    """
    # Let requests do the percent-encoding instead of hand-building the URL.
    params = {
        "action": "wbgetentities",
        "sites": "enwiki",
        "titles": enwiki_title,
        "props": "sitelinks",
        "format": "json",
    }
    # A timeout keeps a stalled connection from hanging the bot forever.
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    entities = data.get("entities")
    if not entities:
        # API error payloads carry an "error" key instead of "entities".
        raise KeyError(
            f"Wikidata returned no entities for {enwiki_title!r}: "
            f"{data.get('error')}"
        )

    entity = next(iter(entities.values()))  # Get the first (and only) entity
    if "sitelinks" in entity and "kuwiki" in entity["sitelinks"]:
        kuwiki_title = entity["sitelinks"]["kuwiki"]["title"]
        if VERBOSE:
            print(f"Kurdish title for '{enwiki_title}' fetched: '{kuwiki_title}'")
        # Strip only the leading namespace prefix; str.replace() would also
        # remove the text if it occurred inside the page name itself.
        prefix = 'Kategorî:'
        if kuwiki_title.startswith(prefix):
            kuwiki_title = kuwiki_title[len(prefix):]
        return kuwiki_title

    if VERBOSE:
        print(f"Kurdish title for '{enwiki_title}' not found.")
    return None
class CategorizeBot(
    SingleSiteBot,
    ExistingPageBot,
    AutomaticTWSummaryBot,
):
    """Add categories to kuwiki pages based on their enwiki counterpart.

    For every page the bot looks up the enwiki page via Wikidata, walks the
    enwiki categories, maps each one to its kuwiki counterpart (asking the
    CategoryCreator to create it when there is none) and appends the
    eligible ones to the page text.
    """

    use_redirects = False  # treats non-redirects only

    update_options = {
        'async': False,
        'showdiff': False,
        'ignore': CANCEL.MATCH,
    }

    def __init__(self, generator, category_creator, **kwargs):
        """
        @param generator: the page generator that determines which pages
            to work on
        @type generator: generator
        @param category_creator: the CategoryCreator instance
        @type category_creator: CategoryCreator
        """
        super().__init__(site=True, **kwargs)
        self.generator = generator
        self.category_creator = category_creator  # Store the CategoryCreator instance
        # Pages already in one of these categories are left untouched.
        self.skip_categories = [
            "Rûpelên ku heman kategoriyê qebûl nakin",
            "Gotara bi soranî",
            "Gotara bi kirmaşanî",
            "Gotara bi kurdiya başûr",
            "Gotara bi zazakî",
            "Rûpelên cudakirinê"
        ]
        self.summary = (
            "[[Bikarhêner:Balyozxane/skrîpt/py/categorizeWithCreator.py|Bot]]: Kategoriyên kêm ji en.wîkiyê lê hatin zêdekirin"
        )
        self.allowednamespaces = [0, 14]
        # The attribute name is inherited from the fawiki original; the site
        # it holds is the Kurdish Wikipedia.
        self.site_fa = pywikibot.Site("ku")
        self.site_en = pywikibot.Site("en")
        self.remove_parent = False
        # Lower-cased names of the "uncategorised" template and its redirects.
        self.uncat_templates = [
            redirect.lower()
            for redirect in self.get_template_redirects("Bêkategorî")
        ]

    def get_template_redirects(self, template_title):
        """Return the names under which a template can be transcluded.

        @param template_title: template name without the namespace prefix
        @type template_title: str
        @return: titles (without namespace) of all template-namespace
            redirects to the template, followed by the template's own name
        @rtype: list of str
        """
        template_title = "Şablon:" + template_title
        template_page = pywikibot.Page(self.site, template_title)
        redirects = template_page.backlinks(filter_redirects=True, namespaces=[10])
        redirect_titles = [redirect.title(with_ns=False) for redirect in redirects]
        redirect_titles.append(template_title.split(":")[-1])
        if VERBOSE:
            print(f"{template_title} redirects:\n{redirect_titles}")
        return redirect_titles

    @staticmethod
    def list_intersection(list1, list2):
        """Return the items of list1 that also occur in list2.

        Order and duplicates of list1 are preserved.
        """
        return [value for value in list1 if value in list2]

    # NOTE(review): lru_cache on methods keys on ``self`` and keeps the bot
    # instance alive for the life of the cache (ruff B019). Left as is
    # because the script only ever builds one bot per run.
    @lru_cache(maxsize=None)
    def get_existing_cats(self, page):
        """Get a list() of categories the page is in.

        @param page: page whose categories are read
        @type page: pywikibot.Page
        @return: category titles without the namespace prefix; the list is
            cached and shared between callers, so it must not be mutated
        @rtype: list of str
        """
        return [c.title(with_ns=False) for c in page.categories()]

    @lru_cache(maxsize=None)
    def check_eligibility(self, candidate):
        """Determine if the kuwiki category is addable.

        @param candidate: kuwiki category name without namespace prefix
        @type candidate: str
        @return: False if the category does not exist or is itself in one
            of the ineligible parent categories, True otherwise
        @rtype: bool
        """
        cat = pywikibot.Page(self.site_fa, "Kategorî:%s" % candidate)
        if not cat.exists():
            return False
        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Kategoriyên veşartî",
            "Tracking_categories",
            "Kategoriyên şitilan",
            "Beralîkirinên kategoriyan",
            "Infobox mapframe without OSM relation ID on Wikidata",
            "Bajarên Tirkiyeyê",
            "Tirk li gorî pîşeyan",
            "Çalakvanên tirk",
            "Nivîskarên tirk",
            "Zanyarên tirk",
            "Muzîkvanên tirk",
            "Derhênerên tirk",
            "Rûpelên ku heman kategoriyê qebûl nakin"
        ]
        return not self.list_intersection(ineligible_parents, cat_cats)

    @lru_cache(maxsize=None)
    def check_eligibility_en(self, candidate):
        """Determine if the enwiki category is worth mapping to kuwiki.

        @param candidate: enwiki category name without namespace prefix
        @type candidate: str
        @return: False if the category is in one of the ineligible parent
            categories, True otherwise
        @rtype: bool
        """
        cat = pywikibot.Page(self.site_en, "Category:%s" % candidate)
        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Hidden categories",
            "Tracking categories",
            "Turkish people by occupation",
            "Cities in Turkey",
            "Turkish Kurdistan",
            "Iraqi Kurdistan",
            "Syrian Kurdistan",
            "Iranian Kurdistan",
            "Stub categories"
        ]
        return not self.list_intersection(ineligible_parents, cat_cats)

    @lru_cache(maxsize=None)
    def is_child_category_of(self, child, parent):
        """Return True if kuwiki category *child* is directly in *parent*.

        Both arguments are category names without the namespace prefix.
        """
        child_cat = pywikibot.Page(self.site_fa, "Kategorî:%s" % child)
        return parent in self.get_existing_cats(child_cat)

    def do_kozmetik(self, old_text):
        """Apply cosmetic changes to the text.

        @param old_text: wikitext to clean up
        @type old_text: str
        @return: tuple of (text to save, edit-summary suffix); the suffix is
            empty when the toolkit changed nothing
        @rtype: tuple of (str, str)
        """
        cc_toolkit = CosmeticChangesToolkit(self.current_page,
                                            ignore=self.opt.ignore)
        new_text, summaries = cc_toolkit.change(old_text)
        # The toolkit may hand back False instead of text (the original code
        # already tested for it). Fall back to the unchanged text so that
        # False is never passed on to put_current().
        if new_text is False or new_text == old_text:
            return old_text, ""

        kozmetik_cebu = "; paqijiyên kozmetîk"
        applies_summaries = ', '.join(summaries.values())
        if applies_summaries:
            kozmetik_cebu += f' ({applies_summaries}.)'
        return new_text, kozmetik_cebu

    def remove_uncat(self, old_text):
        """Strip the "uncategorised" maintenance template from the text.

        @param old_text: wikitext to process
        @type old_text: str
        @return: tuple of (resulting text, edit-summary suffix); the suffix
            is empty when no matching template was found
        @rtype: tuple of (str, str)
        """
        parsed = mwparserfromhell.parse(old_text)
        removed_template = ""
        for template in parsed.filter_templates():
            template_name = template.name.strip().lower()  # Convert template name to lowercase
            if template_name in self.uncat_templates:
                parsed.remove(template)
                removed_template = "; {{[[Şablon:Bêkategorî|bêkategorî]]}} hat rakirin"
        return str(parsed), removed_template

    def treat_page(self):
        """Process the current page that the bot is working on."""
        page = self.current_page
        if VERBOSE:
            print(f"page.title: {page.title()}")

        if page.namespace() not in self.allowednamespaces:
            if VERBOSE:
                pywikibot.output("Namespace not allowed!")
            return False

        current_categories = self.get_existing_cats(page)
        if set(self.skip_categories) & set(current_categories):
            if VERBOSE:
                pywikibot.output("Page disallows this bot; skipped.")
            return

        enwiki_title = get_enwiki_title(page.title())
        if enwiki_title is None:
            if VERBOSE:
                pywikibot.output("No interwiki link to enwiki; skipped.")
            return False
        if VERBOSE:
            print(f"enwiki_title: {enwiki_title}")

        remote_page = pywikibot.Page(self.site_en, enwiki_title)
        if VERBOSE:
            print(f"remote_page: {remote_page}")
        if remote_page.isRedirectPage():
            if VERBOSE:
                pywikibot.output("Target page is a redirect; skipped.")
            return False

        remote_categories = list(remote_page.categories())
        added_categories = []
        removed_categories = []

        for rc in remote_categories:
            if VERBOSE:
                print(f"remote_category: {rc}")
            try:
                rc_name = rc.title(with_ns=False)
                if self.check_eligibility_en(rc_name) is False:
                    continue

                # An empty title is treated like "no counterpart".
                candidate = get_kuwiki_title(rc.title()) or None
                if VERBOSE:
                    print(f"candidate: {candidate}")

                if candidate is None:
                    # No kuwiki counterpart yet: try to create one.
                    category_created = self.category_creator.create_category(rc_name)
                    # Check if the category is created successfully
                    if category_created:
                        if (category_created not in current_categories
                                and category_created not in added_categories):
                            added_categories.append(category_created)
                    elif VERBOSE:
                        pywikibot.output(f"Failed to create category for {rc.title()}")
                    # Nothing below applies to a missing candidate.
                    continue

                if candidate in current_categories:
                    if VERBOSE:
                        print(f"{candidate} is in current categories")
                    continue
                if VERBOSE:
                    print(f"{candidate} not in {current_categories}")

                if not self.check_eligibility(candidate):
                    continue

                # If a child of this category is already used, don't add it
                skip_less_specific = False
                for cc in current_categories:
                    if self.is_child_category_of(cc, candidate):
                        skip_less_specific = True
                        if VERBOSE:
                            pywikibot.output(
                                "More specific category already used."
                            )
                        # One match decides it; skip the remaining lookups.
                        break
                if skip_less_specific:
                    continue

                # Otherwise add this category (never the same link twice)
                if candidate not in added_categories:
                    added_categories.append(candidate)

                # If a parent of what you just added is used, remove it
                if self.remove_parent is True:
                    candidate_page = pywikibot.Page(
                        self.site_fa,
                        "Kategorî:%s" % candidate
                    )
                    candidate_parents = self.get_existing_cats(candidate_page)
                    intersection = self.list_intersection(
                        candidate_parents,
                        current_categories)
                    if intersection:
                        if VERBOSE:
                            pywikibot.output("Removing less specific parent.")
                        removed_categories.extend(intersection)
            except pywikibot.exceptions.UnknownSiteError as e:
                # Handle the specific exception and continue the loop
                pywikibot.warning(f"Skipping unknown site: {e}")
                continue
            except Exception as e:
                # Deliberate best effort: one broken remote category must
                # not stop the remaining ones from being processed.
                pywikibot.error(f"Error processing remote category: {e}")

        if not added_categories:
            return

        text = page.text + "".join(
            "\n[[Kategorî:%s]]" % ac for ac in added_categories
        )

        for rc in removed_categories:
            # Escape the name: titles may contain regex metacharacters such
            # as parentheses, which would otherwise corrupt the pattern.
            rc_pattern = r"\n\[\[Kategorî:" + re.escape(rc) + r"(\|[^\]]*)?\]\]"
            text = re.sub(rc_pattern, "", text)

        new_text, removed_template = self.remove_uncat(text)
        if removed_template:
            text = new_text

        cleaned_new_text, kozmetik_cebu = self.do_kozmetik(text)
        summary = f"{self.summary}{removed_template}{kozmetik_cebu}"
        self.put_current(
            cleaned_new_text,
            summary=summary,
            asynchronous=self.opt['async'],
            show_diff=self.opt['showdiff']
        )
def main(*args):
    """
    Process command line arguments and invoke bot.

    Besides the global pywikibot and page-generator arguments, the option
    ``-ignore:MODE`` selects a member of CANCEL by name (case-insensitive);
    every other remaining ``-name`` argument is passed to the bot as a
    boolean option set to True.

    @param args: command line arguments
    @type args: list of unicode
    @return: True if the bot was run, False if no generator was given
    @rtype: bool
    @raise ValueError: if the -ignore mode is not a member of CANCEL
    """
    options = {}
    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)
    # Process pagegenerators arguments
    gen_factory = pagegenerators.GeneratorFactory()
    local_args = gen_factory.handle_args(local_args)

    # Parse command line arguments
    for arg in local_args:
        name, _, value = arg.partition(":")
        option = name[1:]  # drop the leading dash
        # The option name no longer carries the dash here. The previous
        # comparison against '-ignore' therefore never matched and
        # "-ignore:MODE" ended up as the boolean True.
        if option == 'ignore':
            value = value.upper()
            try:
                options['ignore'] = getattr(CANCEL, value)
            except AttributeError:
                raise ValueError(f'Unknown ignore mode {value!r}!') from None
        # Take the remaining options (-always, -async, -showdiff, ...) as
        # booleans.
        else:
            options[option] = True

    category_creator = CategoryCreator()
    gen = gen_factory.getCombinedGenerator(preload=True)
    if gen:
        bot = CategorizeBot(gen, category_creator, **options)
        bot.run()
        return True

    pywikibot.bot.suggest_help(missing_generator=True)
    return False
# Run the bot only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()