Bikarhêner:Balyozxane/skrîpt/py/categorizeWithCreator.py

Ji Wîkîpediya, ensîklopediya azad.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
python pwb.py updatewin -file:"categorizewithcreator.py" -s:"skip disambigs"

This bot finds the English Wikipedia counterpart of a non-English Wikipedia
page and fetches its categories. If any of those categories has a counterpart
in the origin Wikipedia, the bot then adds the page to those categories.

Forked to create categories using [[Bikarhêner:Balyozxane/skrîpt/py/category creator.py]]

"""
#
# (C) User:Huji, 2021
# The original version can be found at https://github.com/PersianWikipedia/fawikibot/blob/master/categorize.py
# Distributed under the terms of the MIT license.
#
import pywikibot
from pywikibot import pagegenerators
from functools import lru_cache
from pywikibot.bot import (
    SingleSiteBot,
    ExistingPageBot,
    AutomaticTWSummaryBot,
)
import re
import mytools
import mwparserfromhell
from kucosmetics import CANCEL, CosmeticChangesToolkit
from category_creator import CategoryCreator

# Show help with the parameter -help.
# Maps the "&params;" placeholder to the generic page generator help text.
# NOTE(review): presumably consumed by pywikibot's help output; the module
# docstring above contains no "&params;" placeholder - confirm it is needed.
docuReplacements = {"&params;": pagegenerators.parameterHelp}

# Module-wide switch for the extra diagnostic output printed by the bot.
# CategorizeBot.__init__ overwrites it with the bot's 'showdiff' option.
VERBOSE = False


class CategorizeBot(
    SingleSiteBot,
    ExistingPageBot,
    AutomaticTWSummaryBot,
):
    """Add categories to kuwiki pages based on their enwiki counterpart.

    For every page delivered by the generator the bot looks up the enwiki
    page linked to it, walks through the unhidden categories of that page
    and, for each of them, either takes the kuwiki category linked to it or
    asks the supplied ``CategoryCreator`` to create one. Categories that
    pass the eligibility checks and are not on the page yet are appended
    to the page text.
    """

    use_redirects = False  # treats non-redirects only

    # Extra options understood by this bot, with their defaults.
    update_options = {
        'async': False,
        'showdiff': False,
        'ignore': CANCEL.MATCH,
    }

    def __init__(self, generator, category_creator, **kwargs):
        """
        Initialize the bot.

        :param generator: the page generator that determines which pages
            to work on
        :type generator: generator
        :param category_creator: the CategoryCreator instance used to
            create kuwiki categories that have no counterpart yet
        :type category_creator: CategoryCreator
        :param kwargs: bot options, passed on to the parent classes
        """
        super().__init__(site=True, **kwargs)
        self.generator = generator
        self.category_creator = category_creator
        # Pages that are in any of these categories are never touched.
        self.skip_categories = [
            "Rûpelên ku heman kategoriyê qebûl nakin",
            "Gotara bi soranî",
            "Gotara bi kirmaşanî",
            "Gotara bi kurdiya başûr",
            "Gotara bi zazakî",
            "Rûpelên cudakirinê"
        ]
        self.summary = (
            "[[Bikarhêner:Balyozxane/skrîpt/py/categorizeWithCreator.py|Bot]]: Kategoriyên kêm ji en.wîkiyê lê hatin zêdekirin"
        )
        self.allowednamespaces = [0, 14]
        self.site_ku = pywikibot.Site("ku", 'wikipedia')
        self.site_en = pywikibot.Site("en", 'wikipedia')
        # When True, parents of a newly added category are removed from
        # the page; switched off by default.
        self.remove_parent = False
        self.uncat_templates = mytools.get_template_redirects(self.site_ku, "Bêkategorî")
        # Set VERBOSE based on showdiff option
        global VERBOSE
        VERBOSE = self.opt.get('showdiff', False)

    @staticmethod
    def list_intersection(list1, list2):
        """Return the items of ``list1`` that also occur in ``list2``.

        The order (and any duplicates) of ``list1`` are preserved.
        """
        return [value for value in list1 if value in list2]

    # The lru_cache decorators below are unbounded and keyed on
    # (self, argument), so every lookup is made once per bot run.
    @lru_cache(maxsize=None)
    def get_existing_cats(self, page):
        """Get a list() of unhidden categories the page is in.

        :param page: the page whose categories are wanted
        :return: category titles without the namespace prefix. The list is
            cached and shared between callers, so it must not be mutated.
        """
        cats = mytools.get_unhidden_categories(page.site.code, page.title())
        return [
            pywikibot.Page(page.site, cat).title(with_ns=False)
            for cat in cats
        ]

    @lru_cache(maxsize=None)
    def check_eligibility(self, candidate):
        """Determine if the kuwiki category is addable.

        :param candidate: kuwiki category title without namespace prefix
        :return: False when the category does not exist, is hidden, is in
            one of the maintenance categories or has one of the ineligible
            parents; True otherwise
        """
        cat = pywikibot.Category(self.site_ku, "Kategorî:%s" % candidate)
        if not cat.exists():
            return False
        if cat.isHiddenCategory():
            return False

        maint_cats = [
            "Kategoriyên şopandinê",
            "Kategoriyên şitilan",
            "Rûpelên ku heman kategoriyê qebûl nakin"
        ]
        if any(mytools.is_category_in_page(cat, maint_cat)
               for maint_cat in maint_cats):
            return False

        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Bajarên Tirkiyeyê",
            "Tirk li gorî pîşeyan",
            "Çalakvanên tirk",
            "Nivîskarên tirk",
            "Zanyarên tirk",
            "Muzîkvanên tirk",
            "Derhênerên tirk"
        ]
        return not self.list_intersection(ineligible_parents, cat_cats)

    @lru_cache(maxsize=None)
    def check_eligibility_en(self, candidate):
        """Determine if the enwiki category is worth looking at.

        :param candidate: enwiki category title without namespace prefix
        :return: False when the category is hidden, is a tracking category
            or has one of the ineligible parents; True otherwise
        """
        cat = pywikibot.Category(self.site_en, "Category:%s" % candidate)
        if cat.isHiddenCategory():
            return False
        if mytools.is_category_in_page(cat, "Tracking categories"):
            return False
        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Turkish people by occupation",
            "Cities in Turkey",
            "Turkish Kurdistan",
            "Iraqi Kurdistan",
            "Syrian Kurdistan",
            "Iranian Kurdistan",
            "Stub categories"
        ]
        return not self.list_intersection(ineligible_parents, cat_cats)

    @lru_cache(maxsize=None)
    def is_child_category_of(self, child, parent):
        """Return True if kuwiki category ``child`` is directly in ``parent``.

        Both arguments are category titles without namespace prefix.
        """
        child_cat = pywikibot.Page(self.site_ku, "Kategorî:%s" % child)
        return parent in self.get_existing_cats(child_cat)

    def do_kozmetik(self, old_text):
        """Run the cosmetic changes toolkit over ``old_text``.

        :param old_text: the wikitext to clean up
        :return: tuple ``(text, summary_suffix)``. ``text`` is the cleaned
            wikitext, or ``old_text`` itself when the toolkit hands back
            ``False`` instead of a text. ``summary_suffix`` is an edit
            summary fragment, empty when the text was left unchanged.
        """
        cc_toolkit = CosmeticChangesToolkit(self.current_page,
                                            ignore=self.opt.ignore)
        new_text, summaries = cc_toolkit.change(old_text)
        if new_text is False:
            # Never pass False on as page text; keep the uncleaned text.
            return old_text, ""
        if new_text == old_text:
            return new_text, ""

        kozmetik_cebu = "; paqijiyên kozmetîk"
        applied_summaries = ', '.join(summaries.values()) if summaries else ""
        if applied_summaries:
            kozmetik_cebu += f' ({applied_summaries}.)'
        return new_text, kozmetik_cebu

    def remove_uncat(self, old_text):
        """Remove the "uncategorized" maintenance template from the text.

        :param old_text: the wikitext to process
        :return: tuple ``(text, summary_suffix)``; ``summary_suffix`` is
            empty when no template listed in ``self.uncat_templates`` was
            found
        """
        parsed = mwparserfromhell.parse(old_text)
        removed_template = ""
        for template in parsed.filter_templates():
            template_name = template.name.strip()
            if not template_name:
                # Nothing to capitalise or compare.
                continue
            template_name = template_name[0].upper() + template_name[1:]
            if template_name not in self.uncat_templates:
                continue
            try:
                parsed.remove(template)
            except ValueError:
                # Already gone together with an enclosing template that
                # was removed earlier in this loop.
                continue
            removed_template = "; {{[[Şablon:Bêkategorî|bêkategorî]]}} hat rakirin"

        return str(parsed), removed_template

    def treat_page(self):
        """Process the current page that the bot is working on.

        Skips disambiguation pages, pages outside the allowed namespaces,
        pages in one of the skip categories and pages without a usable
        enwiki counterpart. Otherwise collects the missing categories and,
        if there are any, saves the page with them appended.
        """
        page = self.current_page

        if VERBOSE:
            print(f"page.title: {page.title()}")

        if page.isDisambig() or page.namespace() not in self.allowednamespaces:
            if VERBOSE:
                pywikibot.output("Namespace not allowed! or dismabig page")
            return

        # Cached list shared with other callers: read it, never mutate it.
        current_categories = self.get_existing_cats(page)
        if set(self.skip_categories) & set(current_categories):
            if VERBOSE:
                pywikibot.output("Page disallows this bot; skipped.")
            return

        if mytools.is_category_in_page(page, "Rûpelên ku heman kategoriyê qebûl nakin"):
            if VERBOSE:
                pywikibot.output("Page disallows this bot; skipped.")
            return

        enwiki_title = mytools.get_sitelink("kuwiki", "enwiki", page.title())

        if enwiki_title is None:
            if VERBOSE:
                pywikibot.output("No interwiki link to enwiki; skipped.")
            return

        if VERBOSE:
            print(f"enwiki_title: {enwiki_title}")

        remote_page = pywikibot.Page(self.site_en, enwiki_title)

        if remote_page.isRedirectPage() or remote_page.isDisambig():
            if VERBOSE:
                pywikibot.output("Target page is a redirect or disambig; skipped.")
            return

        remote_categories = self.get_existing_cats(remote_page)
        added_categories = []
        removed_categories = []

        for remote_cat in remote_categories:
            if VERBOSE:
                print(f"remote_category: {remote_cat}")

            try:
                if self.check_eligibility_en(remote_cat) is False:
                    continue
                candidate = None
                ku_cat_title = mytools.get_sitelink("enwiki", "kuwiki", "Category:" + remote_cat)
                if ku_cat_title:
                    candidate = ku_cat_title.replace("Kategorî:", "")

                if VERBOSE:
                    print(f"candidate: {candidate}")

                if candidate is None:
                    # No kuwiki counterpart yet: try to create one.
                    category_created = self.category_creator.create_category(remote_cat)

                    if category_created:
                        if (category_created not in current_categories
                                and category_created not in added_categories):
                            added_categories.append(category_created)
                    elif VERBOSE:
                        pywikibot.output(f"Failed to create category for {remote_cat}")
                    # This remote category is fully handled; without the
                    # continue the checks below would run for "None".
                    continue

                if candidate in current_categories:
                    if VERBOSE:
                        print(f"{candidate} is in current categories")
                    continue

                if VERBOSE:
                    print(f"{candidate} not in {current_categories}")

                if not self.check_eligibility(candidate):
                    continue

                # If a child of this category is already used, don't add it
                skip_less_specific = False
                for cc in current_categories:
                    if self.is_child_category_of(cc, candidate):
                        skip_less_specific = True
                        if VERBOSE:
                            pywikibot.output(
                                "More specific category already used."
                            )
                        break

                # Otherwise add this category
                if not skip_less_specific and candidate not in added_categories:
                    added_categories.append(candidate)

                # If a parent of what you just added is used, remove it
                if self.remove_parent is True:
                    candidate_page = pywikibot.Page(
                        self.site_ku,
                        "Kategorî:%s" % candidate
                    )
                    candidate_parents = self.get_existing_cats(candidate_page)
                    intersection = self.list_intersection(
                        candidate_parents,
                        current_categories)
                    if intersection:
                        if VERBOSE:
                            pywikibot.output("Removing less specific parent.")
                        removed_categories.extend(intersection)

            except pywikibot.exceptions.UnknownSiteError as e:
                # Handle the specific exception and continue the loop
                pywikibot.warning(f"Skipping unknown site: {e}")
                continue
            except Exception as e:
                # Best effort: one failing category must not abort the page
                pywikibot.error(f"Error processing remote category: {e}")

        if not added_categories:
            return

        text = page.text + "".join(
            "\n[[Kategorî:%s]]" % ac for ac in added_categories
        )

        for removed_cat in removed_categories:
            # Escape the title: it may contain regex metacharacters such
            # as parentheses.
            rc_pattern = (r"\n\[\[Kategorî:" + re.escape(removed_cat)
                          + r"(\|[^\]]*)?\]\]")
            text = re.sub(rc_pattern, "", text)

        new_text, removed_template = self.remove_uncat(text)
        if removed_template:
            text = new_text

        cleaned_new_text, kozmetik_cebu = self.do_kozmetik(text)
        summary = f"{self.summary}{removed_template}{kozmetik_cebu}"

        self.put_current(
            cleaned_new_text,
            summary=summary,
            asynchronous=self.opt['async'],
            show_diff=self.opt['showdiff']
        )


def main(*args: str) -> None:
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Besides the global and page generator arguments, ``-ignore:MODE`` is
    resolved to the ``CANCEL`` member named ``MODE`` (case-insensitive);
    every other local argument ``-name`` is passed to the bot as the
    boolean option ``name`` (e.g. ``-always``, ``-async``, ``-showdiff``).

    :param args: command line arguments
    :raises ValueError: if the mode given with ``-ignore`` is not an
        attribute of ``CANCEL``
    """
    options = {}
    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    gen_factory = pagegenerators.GeneratorFactory()

    # Process pagegenerators arguments
    local_args = gen_factory.handle_args(local_args)

    # Parse your own command line arguments
    for arg in local_args:
        opt, _, value = arg.partition(':')
        if opt == '-ignore':
            # Compare against the name WITH its leading dash; stripping it
            # first made this branch unreachable.
            mode = value.upper()
            try:
                options['ignore'] = getattr(CANCEL, mode)
            except AttributeError:
                raise ValueError(f'Unknown ignore mode {mode!r}!') from None
        else:
            # -always, -async, -showdiff and any remaining option are
            # taken as booleans, keyed by their name without the dash.
            # You will get a hint if they aren't pre-defined in your bot
            # class.
            options[opt[1:]] = True

    category_creator = CategoryCreator()
    gen = gen_factory.getCombinedGenerator(preload=True)

    # check if further help is needed
    if not pywikibot.bot.suggest_help(missing_generator=not gen):
        # pass generator and private options to the bot
        bot = CategorizeBot(gen, category_creator, **options)
        bot.run()


# Run the bot only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()