Bikarhêner:Balyozxane/skrîpt/py/categorizeWithCreator.py

Ji Wîkîpediya, ensîklopediya azad.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot finds the English Wikipedia counterpart of a non-English Wikipedia
page and fetches its categories. If any of those categories has a counterpart
in the origin Wikipedia, the bot then adds the page to those categories.

Forked to create categories using [[Bikarhêner:Balyozxane/skrîpt/py/category creator.py]]
"""
#
# (C) User:Huji, 2021
# The original version can be found at https://github.com/PersianWikipedia/fawikibot/blob/master/categorize.py
# Distributed under the terms of the MIT license.
#
import pywikibot
from category_creator import CategoryCreator  # Import the CategoryCreator class
from pywikibot import pagegenerators
from functools import lru_cache
from pywikibot.bot import (
    SingleSiteBot,
    ExistingPageBot,
    NoRedirectPageBot,
    AutomaticTWSummaryBot,
)
import re
import requests
import mwparserfromhell
from kucosmetics import CANCEL, CosmeticChangesToolkit
import urllib.parse

# Show help with the parameter -help.
# Maps the "&params;" placeholder to the shared page-generator help text.
docuReplacements = {"&params;": pagegenerators.parameterHelp}

# When True, the functions and bot in this module emit extra diagnostic
# output (via print / pywikibot.output) while they work.
VERBOSE = False


def get_enwiki_title(kuwiki_title, timeout=30):
    """Return the enwiki title linked to a kuwiki page through Wikidata.

    The Wikidata ``wbgetentities`` API is asked for the sitelinks of the
    item whose ``kuwiki`` sitelink is *kuwiki_title*.

    @param kuwiki_title: full title of the page on kuwiki
    @type kuwiki_title: str
    @param timeout: seconds to wait for the HTTP request before giving up
    @type timeout: float
    @return: the title of the linked enwiki page, or None when the API
        response contains no entity or the entity has no enwiki sitelink
    @rtype: str or None
    @raise requests.RequestException: on network failures, timeouts or a
        non-success HTTP status
    """
    # Let requests build and encode the query string instead of formatting
    # the URL by hand.
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "sites": "kuwiki",
            "titles": kuwiki_title,
            "props": "sitelinks",
            "format": "json",
        },
        timeout=timeout,  # without a timeout a stalled connection blocks forever
    )
    response.raise_for_status()

    # An API error payload carries no "entities" key; treat it as "not found".
    entities = response.json().get("entities") or {}
    entity = next(iter(entities.values()), None)  # the first (and only) entity
    if not entity:
        return None

    sitelink = entity.get("sitelinks", {}).get("enwiki")
    if sitelink is None:
        return None
    return sitelink["title"]


def get_kuwiki_title(enwiki_title, timeout=30):
    """Return the kuwiki title linked to an enwiki page through Wikidata.

    The Wikidata ``wbgetentities`` API is asked for the sitelinks of the
    item whose ``enwiki`` sitelink is *enwiki_title*. A leading
    ``Kategorî:`` namespace prefix is stripped from the result, so for
    categories the bare category name is returned.

    @param enwiki_title: full title of the page on enwiki
    @type enwiki_title: str
    @param timeout: seconds to wait for the HTTP request before giving up
    @type timeout: float
    @return: the linked kuwiki title without a leading ``Kategorî:``, or
        None when the API response contains no entity or the entity has no
        kuwiki sitelink
    @rtype: str or None
    @raise requests.RequestException: on network failures, timeouts or a
        non-success HTTP status
    """
    # Let requests build and encode the query string instead of formatting
    # the URL by hand.
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "sites": "enwiki",
            "titles": enwiki_title,
            "props": "sitelinks",
            "format": "json",
        },
        timeout=timeout,  # without a timeout a stalled connection blocks forever
    )
    response.raise_for_status()

    # An API error payload carries no "entities" key; treat it as "not found".
    entities = response.json().get("entities") or {}
    entity = next(iter(entities.values()), None)  # the first (and only) entity
    sitelink = entity.get("sitelinks", {}).get("kuwiki") if entity else None

    if sitelink is None:
        if VERBOSE:
            print(f"Kurdish title for '{enwiki_title}' not found.")
        return None

    kuwiki_title = sitelink["title"]
    if VERBOSE:
        print(f"Kurdish title for '{enwiki_title}' fetched: '{kuwiki_title}'")

    # Strip only the leading namespace prefix; a blanket str.replace would
    # also eat the same text if it appeared later inside the title.
    prefix = 'Kategorî:'
    if kuwiki_title.startswith(prefix):
        kuwiki_title = kuwiki_title[len(prefix):]
    return kuwiki_title


class CategorizeBot(
    SingleSiteBot,
    ExistingPageBot,
    AutomaticTWSummaryBot,
):
    """Add categories to local pages based on their enwiki counterpart.

    For every treated page the bot looks up the English Wikipedia page that
    Wikidata links to it, walks that page's categories and, for each
    eligible one, adds the linked local category. When no local category is
    linked, the CategoryCreator passed to the constructor is asked to
    create one.
    """

    use_redirects = False  # treats non-redirects only

    update_options = {
        'async': False,
        'showdiff': False,
        'ignore': CANCEL.MATCH,
    }

    def __init__(self, generator, category_creator, **kwargs):
        """
        @param generator: the page generator that determines which pages
            to work on
        @type generator: generator
        @param category_creator: the CategoryCreator instance
        @type category_creator: CategoryCreator
        """
        super().__init__(site=True, **kwargs)
        self.generator = generator
        self.category_creator = category_creator  # Store the CategoryCreator instance
        # Pages that are already in any of these categories are left alone.
        self.skip_categories = [
            "Rûpelên ku heman kategoriyê qebûl nakin",
            "Gotara bi soranî",
            "Gotara bi kirmaşanî",
            "Gotara bi kurdiya başûr",
            "Gotara bi zazakî",
            "Rûpelên cudakirinê"
        ]
        # Base edit summary; treat_page appends notes about extra clean-up.
        self.summary = (
            "[[Bikarhêner:Balyozxane/skrîpt/py/categorizeWithCreator.py|Bot]]: Kategoriyên kêm ji en.wîkiyê lê hatin zêdekirin"
        )
        # Only pages in these namespace numbers are treated.
        self.allowednamespaces = [0, 14]
        # Site with code "ku": the wiki whose categories are looked up/added.
        self.site_fa = pywikibot.Site("ku")
        # Site with code "en": the wiki the categories are copied from.
        self.site_en = pywikibot.Site("en")
        # When True, parents of a candidate category that the page already
        # uses are removed from the page text.
        self.remove_parent = False
        # Lower-cased names of the "Bêkategorî" template and its redirects.
        self.uncat_templates = [
            name.lower() for name in self.get_template_redirects("Bêkategorî")
        ]

    def get_template_redirects(self, template_title):
        """Return the names of a template and of the redirects pointing to it.

        @param template_title: template name without the namespace prefix
        @type template_title: str
        @return: namespace-less titles of all template-namespace redirects
            to the template, followed by the template's own name
        @rtype: list of str
        """
        template_title = "Şablon:" + template_title
        template_page = pywikibot.Page(self.site, template_title)
        redirects = template_page.backlinks(filter_redirects=True, namespaces=[10])
        redirect_titles = [redirect.title(with_ns=False) for redirect in redirects]
        # The template itself is matched too, not only its redirects.
        redirect_titles.append(template_title.split(":")[-1])

        if VERBOSE:
            print(f"{template_title} redirects:\n{redirect_titles}")
        return redirect_titles

    @staticmethod
    def list_intersection(list1, list2):
        """Return the items of list1 that also occur in list2, in list1 order."""
        return [value for value in list1 if value in list2]

    # NOTE: lru_cache on a method keys its entries on (self, argument) and
    # keeps them for the lifetime of the process; the cached list is shared
    # between callers and must not be mutated.
    @lru_cache(maxsize=None)
    def get_existing_cats(self, page):
        """Get a list() of categories the page is in (titles without namespace)."""
        return [cat.title(with_ns=False) for cat in page.categories()]

    @lru_cache(maxsize=None)
    def check_eligibility(self, candidate):
        """Determine if the local category is addable.

        @param candidate: local category name without the namespace prefix
        @type candidate: str
        @return: False when the category page does not exist or is itself
            in one of the ineligible parent categories, True otherwise
        @rtype: bool
        """
        cat = pywikibot.Page(self.site_fa, "Kategorî:%s" % candidate)
        if not cat.exists():
            return False
        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Kategoriyên veşartî",
            "Tracking_categories",
            "Kategoriyên şitilan",
            "Beralîkirinên kategoriyan",
            "Infobox mapframe without OSM relation ID on Wikidata",
            "Bajarên Tirkiyeyê",
            "Tirk li gorî pîşeyan",
            "Çalakvanên tirk",
            "Nivîskarên tirk",
            "Zanyarên tirk",
            "Muzîkvanên tirk",
            "Derhênerên tirk",
            "Rûpelên ku heman kategoriyê qebûl nakin"
        ]
        return not self.list_intersection(ineligible_parents, cat_cats)

    @lru_cache(maxsize=None)
    def check_eligibility_en(self, candidate):
        """Determine if the enwiki category is worth mirroring.

        @param candidate: enwiki category name without the namespace prefix
        @type candidate: str
        @return: False when the category is in one of the ineligible parent
            categories, True otherwise
        @rtype: bool
        """
        cat = pywikibot.Page(self.site_en, "Category:%s" % candidate)
        cat_cats = self.get_existing_cats(cat)
        ineligible_parents = [
            "Hidden categories",
            "Tracking categories",
            "Turkish people by occupation",
            "Cities in Turkey",
            "Turkish Kurdistan",
            "Iraqi Kurdistan",
            "Syrian Kurdistan",
            "Iranian Kurdistan",
            "Stub categories"
        ]
        return not self.list_intersection(ineligible_parents, cat_cats)

    @lru_cache(maxsize=None)
    def is_child_category_of(self, child, parent):
        """Return True if local category *child* is directly inside *parent*.

        Both arguments are category names without the namespace prefix.
        """
        child_cat = pywikibot.Page(self.site_fa, "Kategorî:%s" % child)
        return parent in self.get_existing_cats(child_cat)

    def do_kozmetik(self, old_text):
        """Apply cosmetic changes to the text of the current page.

        @param old_text: wikitext to clean up
        @type old_text: str
        @return: tuple of (text, summary suffix). The suffix is empty when
            nothing changed. When the toolkit yields False instead of text
            (the case the ``is not False`` guard was written for), the
            input text is returned unchanged so that the caller never
            saves False as page content.
        @rtype: tuple of (str, str)
        """
        cc_toolkit = CosmeticChangesToolkit(self.current_page,
                                            ignore=self.opt.ignore)
        new_text, summaries = cc_toolkit.change(old_text)

        if new_text is False:
            return old_text, ""
        if new_text == old_text:
            return new_text, ""

        kozmetik_cebu = "; paqijiyên kozmetîk"
        applied_summaries = ', '.join(summaries.values())
        if applied_summaries:
            kozmetik_cebu += f' ({applied_summaries}.)'
        return new_text, kozmetik_cebu

    def remove_uncat(self, old_text):
        """Remove the "uncategorized" maintenance template from the text.

        @param old_text: wikitext to process
        @type old_text: str
        @return: tuple of (text with every template listed in
            self.uncat_templates removed, summary suffix); the suffix is
            empty when no such template was found
        @rtype: tuple of (str, str)
        """
        parsed = mwparserfromhell.parse(old_text)
        removed_template = ""
        for template in parsed.filter_templates():
            # Template names are compared case-insensitively.
            if template.name.strip().lower() in self.uncat_templates:
                parsed.remove(template)
                removed_template = "; {{[[Şablon:Bêkategorî|bêkategorî]]}} hat rakirin"

        return str(parsed), removed_template

    def _collect_from_remote_category(self, rc, current_categories,
                                      added_categories, removed_categories):
        """Decide what a single enwiki category means for the current page.

        @param rc: a category page of the enwiki counterpart
        @param current_categories: names of the categories the local page
            is already in (not modified)
        @type current_categories: list of str
        @param added_categories: names to add; extended in place
        @type added_categories: list of str
        @param removed_categories: names to remove; extended in place
        @type removed_categories: list of str
        """
        en_name = rc.title(with_ns=False)
        if self.check_eligibility_en(en_name) is False:
            return

        # An empty result counts as "no local counterpart".
        candidate = get_kuwiki_title(rc.title()) or None

        if VERBOSE:
            print(f"candidate: {candidate}")

        if candidate is None:
            category_created = self.category_creator.create_category(en_name)
            if not category_created:
                if VERBOSE:
                    pywikibot.output(f"Failed to create category for {rc.title()}")
                return
            if (category_created not in current_categories
                    and category_created not in added_categories):
                added_categories.append(category_created)
            # Stop here: there is no linked candidate left to evaluate, and
            # falling through would probe a category literally named "None".
            return

        if candidate in current_categories:
            if VERBOSE:
                print(f"{candidate} is in current categories")
            return

        if VERBOSE:
            print(f"{candidate} not in {current_categories}")

        if not self.check_eligibility(candidate):
            return

        # If a child of this category is already used, don't add it
        skip_less_specific = False
        for cc in current_categories:
            if self.is_child_category_of(cc, candidate):
                skip_less_specific = True
                if VERBOSE:
                    pywikibot.output(
                        "More specific category already used."
                    )
                break  # one more specific category is enough

        # Otherwise add this category
        if not skip_less_specific and candidate not in added_categories:
            added_categories.append(candidate)

        # If a parent of the candidate is used on the page, remove it
        if self.remove_parent:
            candidate_page = pywikibot.Page(
                self.site_fa,
                "Kategorî:%s" % candidate
            )
            candidate_parents = self.get_existing_cats(candidate_page)
            intersection = self.list_intersection(
                candidate_parents,
                current_categories)
            if intersection:
                if VERBOSE:
                    pywikibot.output("Removing less specific parent.")
                removed_categories.extend(intersection)

    def treat_page(self):
        """Process the current page that the bot is working on.

        Skips pages outside the allowed namespaces, pages in one of the
        skip categories, pages without an enwiki counterpart and pages
        whose counterpart is a redirect. Otherwise collects the categories
        to add (and, if remove_parent is set, to remove), strips the
        "uncategorized" template, applies cosmetic changes and saves the
        page. Nothing is saved when there is no category to add.
        """
        page = self.current_page

        if VERBOSE:
            print(f"page.title: {page.title()}")

        if page.namespace() not in self.allowednamespaces:
            if VERBOSE:
                pywikibot.output("Namespace not allowed!")
            return False

        current_categories = self.get_existing_cats(page)
        if set(self.skip_categories) & set(current_categories):
            if VERBOSE:
                pywikibot.output("Page disallows this bot; skipped.")
            return

        enwiki_title = get_enwiki_title(page.title())

        if enwiki_title is None:
            if VERBOSE:
                pywikibot.output("No interwiki link to enwiki; skipped.")
            return False

        if VERBOSE:
            print(f"enwiki_title: {enwiki_title}")

        remote_page = pywikibot.Page(self.site_en, enwiki_title)

        if VERBOSE:
            print(f"remote_page: {remote_page}")

        if remote_page.isRedirectPage():
            if VERBOSE:
                pywikibot.output("Target page is a redirect; skipped.")
            return False

        # Fetch the complete category list before doing any work on it.
        remote_categories = list(remote_page.categories())
        added_categories = []
        removed_categories = []

        for rc in remote_categories:
            if VERBOSE:
                print(f"remote_category: {rc}")

            # A failure on one remote category must not abort the others.
            try:
                self._collect_from_remote_category(
                    rc, current_categories,
                    added_categories, removed_categories)
            except pywikibot.exceptions.UnknownSiteError as e:
                pywikibot.warning(f"Skipping unknown site: {e}")
            except Exception as e:
                pywikibot.error(f"Error processing remote category: {e}")

        if not added_categories:
            return

        text = page.text + "".join(
            "\n[[Kategorî:%s]]" % ac for ac in added_categories
        )

        for parent in removed_categories:
            # Escape the name: category titles may contain regex
            # metacharacters such as parentheses.
            parent_pattern = (
                r"\n\[\[Kategorî:" + re.escape(parent) + r"(\|[^\]]*)?\]\]"
            )
            text = re.sub(parent_pattern, "", text)

        # Only take the re-serialised text when a template was removed.
        new_text, removed_template = self.remove_uncat(text)
        if removed_template:
            text = new_text

        cleaned_new_text, kozmetik_cebu = self.do_kozmetik(text)
        summary = f"{self.summary}{removed_template}{kozmetik_cebu}"

        self.put_current(
            cleaned_new_text,
            summary=summary,
            asynchronous=self.opt['async'],
            show_diff=self.opt['showdiff']
        )


def main(*args):
    """
    Process command line arguments and invoke bot.

    Every local argument is split at the first colon into a name and a
    value. ``-ignore:MODE`` is resolved to the attribute of CANCEL named
    MODE (case-insensitive); every other argument is passed to the bot as
    a boolean option set to True.

    @param args: command line arguments
    @type args: list of unicode
    @return: True if the bot was run, False if no page generator was given
    @rtype: bool
    @raise ValueError: if the ignore mode is not an attribute of CANCEL
    """
    options = {}

    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # Process pagegenerators arguments
    gen_factory = pagegenerators.GeneratorFactory()
    local_args = gen_factory.handle_args(local_args)

    # Parse command line arguments
    for arg in local_args:
        name, _, value = arg.partition(":")
        # Strip the leading dash(es) BEFORE comparing. The previous code
        # stripped one dash and then compared against dash-prefixed names,
        # so "-ignore:MODE" was stored as the boolean True instead of the
        # requested CANCEL mode.
        option = name.lstrip("-")
        if option == 'ignore':
            mode = value.upper()
            try:
                options['ignore'] = getattr(CANCEL, mode)
            except AttributeError:
                raise ValueError(f'Unknown ignore mode {mode!r}!') from None
        else:
            # always, async, showdiff and any remaining option are booleans.
            options[option] = True

    category_creator = CategoryCreator()
    gen = gen_factory.getCombinedGenerator(preload=True)
    if not gen:
        pywikibot.bot.suggest_help(missing_generator=True)
        return False

    bot = CategorizeBot(gen, category_creator, **options)
    bot.run()
    return True


if __name__ == "__main__":
    main()