Bikarhêner:Balyozxane/skrîpt/py/categorizeWithCreator.py
Xuyakirin
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
python pwb.py updatewin -file:"categorizewithcreator.py" -s:"redirect?"
This bot finds the English Wikipedia counterpart of a non-English Wikipedia
page and fetches its categories. If any of those categories has a counterpart
in the origin Wikipedia, the bot then adds the page to those categories.
Forked to create categories using [[Bikarhêner:Balyozxane/skrîpt/py/category creator.py]]
"""
#
# (C) User:Huji, 2021
# The original version can be found at https://github.com/PersianWikipedia/fawikibot/blob/master/categorize.py
# Distributed under the terms of the MIT license.
#
import pywikibot
from pywikibot import pagegenerators
from functools import lru_cache
from pywikibot.bot import (
SingleSiteBot,
ExistingPageBot,
AutomaticTWSummaryBot,
)
import re
import mytools
import mwparserfromhell
from kucosmetics import CANCEL, CosmeticChangesToolkit
from category_creator import CategoryCreator
# Show help with the parameter -help.
# Pywikibot substitutes the "&params;" placeholder found in the module
# docstring with the shared page-generator help text. The key must be spelled
# exactly "&params;"; the previous "¶ms;" was an HTML-entity decoding artefact
# ("&para" turned into the pilcrow sign) and could never match.
docuReplacements = {"&params;": pagegenerators.parameterHelp}  # noqa: N816

# Module-wide debug switch; CategorizeBot.__init__ overwrites it with the
# value of the bot's "showdiff" option.
VERBOSE = False
class CategorizeBot(
    SingleSiteBot,
    ExistingPageBot,
    AutomaticTWSummaryBot,
):
    """Add categories to kuwiki pages based on their enwiki counterpart.

    For each treated page the bot looks up the English Wikipedia sitelink,
    reads that page's categories, maps every eligible English category to
    its Kurdish counterpart (asking the ``CategoryCreator`` to create one
    when no counterpart is linked) and appends the missing categories to
    the page text before saving.
    """

    use_redirects = False  # treats non-redirects only

    update_options = {
        'async': False,
        'showdiff': False,
        'ignore': CANCEL.MATCH,
    }

    def __init__(self, generator, category_creator, **kwargs):
        """
        @param generator: the page generator that determines which pages
            to work on
        @type generator: generator
        @param category_creator: the CategoryCreator instance
        @type category_creator: CategoryCreator
        """
        super(CategorizeBot, self).__init__(site=True, **kwargs)
        self.generator = generator
        # Used to create a Kurdish category when an English category has no
        # linked counterpart.
        self.category_creator = category_creator
        # Pages that are members of any of these categories are never edited.
        self.skip_categories = [
            "Rûpelên ku heman kategoriyê qebûl nakin",
            "Gotara bi soranî",
            "Gotara bi kirmaşanî",
            "Gotara bi kurdiya başûr",
            "Gotara bi zazakî",
            "Rûpelên cudakirinê"
        ]
        self.summary = (
            "[[Bikarhêner:Balyozxane/skrîpt/py/categorizeWithCreator.py|Bot]]: Kategoriyên kêm ji en.wîkiyê lê hatin zêdekirin"
        )
        self.allowednamespaces = [0, 14]
        self.site_ku = pywikibot.Site("ku", 'wikipedia')
        self.site_en = pywikibot.Site("en", 'wikipedia')
        # When True, parents of a newly added category that are already on
        # the page are removed from the page text.
        self.remove_parent = False
        # NOTE(review): presumably the names of the "Bêkategorî" template and
        # its redirects; confirm against mytools.get_template_redirects.
        self.uncat_templates = mytools.get_template_redirects(self.site_ku, "Bêkategorî")

        # Set VERBOSE based on showdiff option
        global VERBOSE
        VERBOSE = self.opt.get('showdiff', False)

    @staticmethod
    def list_intersection(list1, list2):
        """Return the items of ``list1`` that also occur in ``list2``.

        Order and duplicates of ``list1`` are preserved.
        """
        return [value for value in list1 if value in list2]

    # NOTE: lru_cache on methods keys on ``self`` and keeps the instance
    # alive for the lifetime of the cache; acceptable here because a single
    # bot instance lives for the whole run.
    @lru_cache(maxsize=None)
    def get_existing_cats(self, page):
        """Get a list() of unhidden categories the page is in.

        The returned titles carry no namespace prefix. The list is cached
        and shared between callers, so it must not be mutated.
        """
        cats = mytools.get_unhidden_categories(page.site.code, page.title(), withNS=True)
        return [
            pywikibot.Page(page.site, cat).title(with_ns=False)
            for cat in cats
        ]

    @lru_cache(maxsize=None)
    def check_eligibility(self, candidate):
        """Determine if the Kurdish category ``candidate`` is addable.

        @param candidate: category title without namespace prefix
        @return: False when the category does not exist, is hidden, is in
            one of the maintenance categories, or has an ineligible parent;
            True otherwise
        """
        cat = pywikibot.Category(self.site_ku, "Kategorî:%s" % candidate)
        if not cat.exists():
            return False
        if cat.isHiddenCategory():
            return False

        maint_cats = [
            "Kategoriyên şopandinê",
            "Kategoriyên şitilan",
            "Rûpelên ku heman kategoriyê qebûl nakin"
        ]
        for maint_cat in maint_cats:
            if mytools.is_category_in_page(cat, maint_cat):
                return False

        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Bajarên Tirkiyeyê",
            "Tirk li gorî pîşeyan",
            "Çalakvanên tirk",
            "Nivîskarên tirk",
            "Zanyarên tirk",
            "Muzîkvanên tirk",
            "Derhênerên tirk"
        ]
        if self.list_intersection(ineligible_parents, cat_cats):
            return False
        return True

    @lru_cache(maxsize=None)
    def check_eligibility_en(self, candidate):
        """Determine if the English category ``candidate`` is usable.

        @param candidate: category title without namespace prefix
        @return: False when the category is hidden, is in
            "Tracking categories", or has an ineligible parent; True
            otherwise
        """
        cat = pywikibot.Category(self.site_en, "Category:%s" % candidate)
        if cat.isHiddenCategory():
            return False
        if mytools.is_category_in_page(cat, "Tracking categories"):
            return False

        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Turkish people by occupation",
            "Cities in Turkey",
            "Turkish Kurdistan",
            "Iraqi Kurdistan",
            "Syrian Kurdistan",
            "Iranian Kurdistan",
            "Stub categories"
        ]
        if self.list_intersection(ineligible_parents, cat_cats):
            return False
        return True

    @lru_cache(maxsize=None)
    def is_child_category_of(self, child, parent):
        """Return True if Kurdish category ``child`` is directly in ``parent``.

        Both arguments are category titles without namespace prefix.
        """
        child_cat = pywikibot.Page(self.site_ku, "Kategorî:%s" % child)
        return parent in self.get_existing_cats(child_cat)

    def do_kozmetik(self, old_text):
        """Run the cosmetic changes toolkit over ``old_text``.

        @return: tuple ``(text, summary_suffix)``. ``summary_suffix`` is
            empty when nothing changed. When the toolkit reports ``False``
            instead of a text, ``old_text`` is returned unchanged so the
            caller never tries to save a non-string value.
        """
        cc_toolkit = CosmeticChangesToolkit(self.current_page,
                                            ignore=self.opt.ignore)
        new_text, summaries = cc_toolkit.change(old_text)
        if new_text is False or new_text == old_text:
            return old_text, ""

        kozmetik_cebu = "; paqijiyên kozmetîk"
        applied_summaries = ', '.join(summaries.values())
        if applied_summaries:
            kozmetik_cebu += f' ({applied_summaries}.)'
        return new_text, kozmetik_cebu

    def remove_uncat(self, old_text):
        """Strip "uncategorised" maintenance templates from ``old_text``.

        @return: tuple ``(text, summary_suffix)``; ``summary_suffix`` is
            empty when no template was removed.
        """
        parsed = mwparserfromhell.parse(old_text)
        removed_template = ""
        for template in parsed.filter_templates():
            template_name = template.name.strip()
            if not template_name:
                # Nothing to capitalise or compare; also avoids IndexError.
                continue
            # Compare with the first letter upper-cased, the form used by the
            # names in self.uncat_templates.
            template_name = template_name[0].upper() + template_name[1:]
            if template_name in self.uncat_templates:
                parsed.remove(template)
                removed_template = "; {{[[Şablon:Bêkategorî|bêkategorî]]}} hat rakirin"
        return str(parsed), removed_template

    def _collect_category_changes(self, remote_cat, current_categories,
                                  added_categories, removed_categories):
        """Decide what one English category means for the current page.

        Appends to ``added_categories`` / ``removed_categories`` in place.
        ``current_categories`` is only read.
        """
        if self.check_eligibility_en(remote_cat) is False:
            return

        candidate = None
        ku_cat_title = mytools.get_sitelink("enwiki", "kuwiki", "Category:" + remote_cat)
        if ku_cat_title:
            ku_cat_page = pywikibot.Category(self.site, ku_cat_title)
            if ku_cat_page.isCategoryRedirect() is True:
                if VERBOSE:
                    print("redirect")
                ku_cat_page = ku_cat_page.getCategoryRedirectTarget()
            # Let pywikibot strip the namespace instead of a blind
            # str.replace, which would also hit the prefix inside a title.
            candidate = ku_cat_page.title(with_ns=False)
            if VERBOSE:
                print(f"candidate: {candidate}")

        if candidate is None:
            # No Kurdish counterpart is linked: try to create one.
            # NOTE(review): create_category is assumed to return the new
            # category title without namespace, or a falsy value on failure.
            category_created = self.category_creator.create_category(remote_cat)
            if category_created:
                if (category_created not in current_categories
                        and category_created not in added_categories):
                    added_categories.append(category_created)
            elif VERBOSE:
                pywikibot.output(f"Failed to create category for {remote_cat}")
            return

        if candidate in current_categories:
            if VERBOSE:
                print(f"{candidate} is in current categories")
            return

        if VERBOSE:
            print(f"{candidate} not in {current_categories}")
        if not self.check_eligibility(candidate):
            return

        # If a child of this category is already used, don't add it
        if any(self.is_child_category_of(cc, candidate)
               for cc in current_categories):
            if VERBOSE:
                pywikibot.output("More specific category already used.")
            return

        # Several English categories may map to the same Kurdish category;
        # add each one only once.
        if candidate not in added_categories:
            added_categories.append(candidate)

        # If a parent of what you just added is used, remove it
        if self.remove_parent is True:
            candidate_page = pywikibot.Page(self.site_ku,
                                            "Kategorî:%s" % candidate)
            candidate_parents = self.get_existing_cats(candidate_page)
            intersection = self.list_intersection(candidate_parents,
                                                  current_categories)
            if intersection:
                if VERBOSE:
                    pywikibot.output("Removing less specific parent.")
                removed_categories.extend(intersection)

    def _save_categories(self, page, added_categories, removed_categories):
        """Apply the collected category changes to ``page`` and save it."""
        text = page.text + "".join(
            "\n[[Kategorî:%s]]" % ac for ac in added_categories
        )
        for rc in removed_categories:
            # Escape the title: names such as "Foo (bar)" contain regex
            # metacharacters and would otherwise never match (or misfire).
            rc_pattern = r"\n\[\[Kategorî:" + re.escape(rc) + r"(\|[^\]]*)?\]\]"
            text = re.sub(rc_pattern, "", text)

        # Only adopt the re-serialised wikitext when a template was removed.
        new_text, removed_template = self.remove_uncat(text)
        if removed_template:
            text = new_text

        cleaned_new_text, kozmetik_cebu = self.do_kozmetik(text)
        summary = f"{self.summary}{removed_template}{kozmetik_cebu}"
        self.put_current(
            cleaned_new_text,
            summary=summary,
            asynchronous=self.opt['async'],
            show_diff=self.opt['showdiff']
        )

    def treat_page(self):
        """Process the current page that the bot is working on."""
        page = self.current_page
        if VERBOSE:
            print(f"page.title: {page.title()}")

        if page.isDisambig() or page.namespace() not in self.allowednamespaces:
            if VERBOSE:
                pywikibot.output("Namespace not allowed! or dismabig page")
            return

        current_categories = self.get_existing_cats(page)
        if set(self.skip_categories) & set(current_categories):
            if VERBOSE:
                pywikibot.output("Page disallows this bot; skipped.")
            return

        if mytools.is_category_in_page(page, "Rûpelên ku heman kategoriyê qebûl nakin"):
            if VERBOSE:
                pywikibot.output("Page disallows this bot; skipped.")
            return

        enwiki_title = mytools.get_sitelink("kuwiki", "enwiki", page.title())
        if enwiki_title is None:
            if VERBOSE:
                pywikibot.output("No interwiki link to enwiki; skipped.")
            return
        if VERBOSE:
            print(f"enwiki_title: {enwiki_title}")

        remote_page = pywikibot.Page(self.site_en, enwiki_title)
        if remote_page.isRedirectPage() or remote_page.isDisambig():
            if VERBOSE:
                pywikibot.output("Target page is a redirect or disambig; skipped.")
            return

        remote_categories = self.get_existing_cats(remote_page)
        added_categories = []
        removed_categories = []

        for remote_cat in remote_categories:
            if VERBOSE:
                print(f"remote_category: {remote_cat}")
            try:
                self._collect_category_changes(
                    remote_cat,
                    current_categories,
                    added_categories,
                    removed_categories,
                )
            except pywikibot.exceptions.UnknownSiteError as e:
                # Handle the specific exception and continue the loop
                pywikibot.warning(f"Skipping unknown site: {e}")
            except Exception as e:
                # One bad category must not abort the whole page; report it
                # and carry on with the next one.
                pywikibot.error(f"Error processing remote category: {e}")

        if added_categories:
            self._save_categories(page, added_categories, removed_categories)
def main(*args: str) -> None:
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Recognised script options (besides the global and page-generator ones):
    ``-always``, ``-async``, ``-showdiff`` and ``-ignore:MODE`` where MODE
    names an attribute of ``CANCEL`` (case-insensitive). Any other option is
    passed to the bot as a boolean flag.

    :param args: command line arguments
    :raises ValueError: if ``-ignore`` names an unknown mode
    """
    options = {}
    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    gen_factory = pagegenerators.GeneratorFactory()

    # Process pagegenerators arguments
    local_args = gen_factory.handle_args(local_args)

    # Parse your own command line arguments
    for arg in local_args:
        arg, _, value = arg.partition(':')
        # Drop the leading dash(es) once and compare against bare option
        # names. Previously the dash was stripped and the result was then
        # compared with dash-prefixed names, so "-ignore:MODE" was never
        # recognised and ended up as options['ignore'] = True.
        option = arg.lstrip('-')
        if option in ('always', 'async', 'showdiff'):
            options[option] = True
        elif option == 'ignore':
            value = value.upper()
            try:
                options['ignore'] = getattr(CANCEL, value)
            except AttributeError:
                raise ValueError(f'Unknown ignore mode {value!r}!') from None
        # take the remaining options as booleans.
        # You will get a hint if they aren't pre-defined in your bot class
        else:
            options[option] = True

    category_creator = CategoryCreator()
    gen = gen_factory.getCombinedGenerator(preload=True)

    # check if further help is needed
    if not pywikibot.bot.suggest_help(missing_generator=not gen):
        # pass generator and private options to the bot
        bot = CategorizeBot(gen, category_creator, **options)
        bot.run()
# Run the bot only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()