Bikarhêner:Balyozxane/skrîpt/py/cavkanileke.py

#!/usr/bin/python3
"""
Fork of [[:mw:Manual:Pywikibot/noreferences.py]]
This script adds a missing references section to pages.

It goes over multiple pages, searches for pages where <references />
is missing although a <ref> tag is present, and in that case adds a new
references section.

These command line parameters can be used to specify which pages to work on:

&params;

Furthermore, the following command line parameters are supported:

-xml          Retrieve information from a local XML dump (pages-articles
              or pages-meta-current, see https://dumps.wikimedia.org).
              Argument can also be given as "-xml:filename".

-always       Don't prompt you for each replacement.

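-quiet        Use this option to get less output.

-ignore       Set the CANCEL mode that is passed to the cosmetic changes
              toolkit. The mode name is given as "-ignore:mode" and is
              case-insensitive; the default is MATCH.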

If neither a page title nor a page generator is given, it takes all pages from
the default maintenance category.

It is strongly recommended not to run this script over the entire article
namespace (using the -start parameter), as that would consume too much
bandwidth. Instead, use the -xml parameter, or use another way to generate
a list of affected articles.
"""
#
# (C) Pywikibot team, 2007-2022
#
# Distributed under the terms of the MIT license.
#
import re
from functools import partial

import pywikibot
from pywikibot import i18n, pagegenerators, textlib
from pywikibot.bot import ExistingPageBot, SingleSiteBot
from pywikibot.exceptions import LockedPageError
from pywikibot.pagegenerators import XMLDumpPageGenerator
from kucosmetics import CANCEL, CosmeticChangesToolkit
import mwparserfromhell
import mytools


def reorder_templates_and_remove_categories(text):
    """Strip bottom-of-page metadata from *text* and return it separately.

    The extraction steps run in a fixed order because every step works on
    the text left over by the previous one:

    1. Koord/Coord templates that carry ``display=title``
    2. stub-like templates (``...-şitil``, ``Şitil``, ``Kurt``, ``Stub``, ...)
    3. authority control templates
    4. ``Şitil-...`` templates
    5. ``[[Kategorî:...]]`` links
    6. ``DEFAULTSORT`` / ``Salê kat bike heke sal hebe`` templates

    :param text: wikitext of the page
    :return: tuple ``(text, koord_templates, templates, categories,
        defaultsort, template_sitil, kontroloto)``. Template entries hold
        the text between the braces, category entries hold the complete
        link, and ``defaultsort`` is the first sort template found or ``""``.
    """
    def _pull(pattern, source):
        # Collect every match of *pattern*, then delete all of them.
        return re.findall(pattern, source), re.sub(pattern, '', source)

    # Koord / Coord templates are handled by the dedicated helpers.
    koord_templates = find_koord_templates(text)
    text = remove_koord_templates(text)

    # Stub-like templates.
    templates, text = _pull(
        r'{{\s*([^\}]+\-şitil|[Şş]iti?l|[Kk]urt|[Ss]tub|[Şş]itlek|[^\}]+\-şitil\-[^\}]+)\s*}}',
        text)

    # Authority control templates.
    kontroloto, text = _pull(
        r'{{\s*([Kk]ontrola otorîteyê|[aA]uthority control|[Kk]ontrola otorîte)\s*}}',
        text)

    # "Şitil-..." templates.
    template_sitil, text = _pull(r'{{\s*([Şş]itil-[^\}]+)\s*}}', text)

    # Category links (no capture group, so the full link is collected).
    categories, text = _pull(r'\[\[Kategorî:[^\]]+\]\]', text)

    # Sort key templates: all are removed, only the first one is kept.
    sort_keys, text = _pull(
        r'{{\s*(DEFAULTSORT:[^}]+|Salê kat bike heke sal hebe)\s*}}', text)
    defaultsort = sort_keys[0] if sort_keys else ""

    return text, koord_templates, templates, categories, defaultsort, template_sitil, kontroloto


def find_koord_templates(text):
    """Return the parameters of every Koord/Coord title template in *text*.

    Only templates whose parameter list ends with ``display=title`` are
    considered. Each returned item is the parameter string that follows
    the first pipe, without the template name and the braces.
    """
    pattern = r'{{\s*([Kk]oord|[Cc]oord)\s*\|\s*([^}]+display\s*=\s*title)\s*}}'
    # Group 1 is the template name, group 2 the parameter string we want.
    return [found.group(2) for found in re.finditer(pattern, text)]


def remove_koord_templates(text):
    """Return *text* without Koord/Coord templates ending in ``display=title``."""
    title_koord = re.compile(
        r'{{\s*([Kk]oord|[Cc]oord)\s*\|\s*[^}]+display\s*=\s*title\s*}}')
    return title_koord.sub('', text)


def append_templates_and_categories(text, koord_templates, templates, categories, defaultsort, template_sitil,
                                    kontroloto):
    """Re-attach previously extracted metadata to the end of *text*.

    Trailing newlines of *text* are dropped first. The parts are then
    appended in this order, skipping the empty ones: authority control
    templates, ``Şitil-...`` templates, stub-like templates, Koord
    templates (always written as ``{{Koord|...}}``), the sort key and,
    separated by a blank line, the categories. Finally every run of two
    or more newlines anywhere in the result is collapsed to exactly two.

    :param text: page text without the extracted parts
    :param koord_templates: parameter strings of the Koord templates
    :param templates: names of the stub-like templates
    :param categories: complete category links
    :param defaultsort: content of the sort template or an empty string
    :param template_sitil: names of the ``Şitil-...`` templates
    :param kontroloto: names of the authority control templates
    :return: the reassembled page text
    """
    def _wrapped_lines(names, opener='{{'):
        # One template per line, each closed with '}}'.
        return '\n'.join(opener + name + '}}' for name in names)

    pieces = [text.rstrip('\n')]

    if kontroloto:
        pieces.append('\n' + _wrapped_lines(kontroloto))
    if template_sitil:
        pieces.append('\n' + _wrapped_lines(template_sitil))
    if templates:
        pieces.append('\n' + _wrapped_lines(templates))
    if koord_templates:
        pieces.append('\n' + _wrapped_lines(koord_templates, '{{Koord|'))
    if defaultsort:
        pieces.append('\n{{' + defaultsort + '}}')
    if categories:
        pieces.append('\n\n' + '\n'.join(categories))

    # Collapse runs of blank lines in the whole text, not only at the end.
    return re.sub(r'\n\n\n*', '\n\n', ''.join(pieces))


# Placeholder substitution for the module docstring: when the script is run
# with the parameter -help, '&params;' is replaced by the generic page
# generator help text.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
}

# References sections are usually placed before further reading / external
# link sections. This dictionary defines these sections per site code,
# sorted by priority.
# For example, on an English wiki, the script would place the "References"
# section in front of the "Further reading" section, if that existed.
# Otherwise, it would try to put it in front of the "External links" section,
# or if that fails, the "See also" section, etc.
placeBeforeSections = {
    'ku': [  # no explicit policy on where to put the references
        'Bîbliyografî',
        'Girêdanên derve',
    ]
}

# Titles of sections in which a references tag fits, keyed by family and
# site code.
# The first title should be the preferred one: it is the one that will be
# used when a new section has to be created. All section titles except the
# first one may be regex patterns.
referencesSections = {
    'wikipedia': {
        'ku': [
            'Çavkanî',
        ]
    },
}

# Templates which include a <references /> tag, keyed by family and site
# code. If there is no such template on your wiki, you don't have to enter
# anything here.
referencesTemplates = {
    'wikipedia': {
        'ku': ['Çavkanî', 'Reflist']
    },
}

# Text to be added instead of the <references /> tag, keyed by family and
# site code. Define this only if required by your wiki.
referencesSubstitute = {
    'wikipedia': {
        'ku': '{{Çavkanî}}'
    },
}

# Site codes where no section title is required for the references template
# as it is already included there.
noTitleRequired = ['be', 'szl']

# Repository item ID of the default maintenance category; main() resolves it
# with site.page_from_repository() when no other page generator is given.
maintenance_category = 'Q6483427'

# Closing </ref> tag, matched case-insensitively.
_ref_regex = re.compile('</ref>', re.IGNORECASE)
# Self-closing <references ... /> tag, matched case-insensitively.
_references_regex = re.compile('<references.*?/>', re.IGNORECASE)


def _match_xml_page_text(text) -> bool:
    """Tell whether a page text has a ref tag but no references tag.

    Disabled parts of the text are removed with
    ``textlib.removeDisabledParts`` before the check.

    :param text: wikitext of a page from the XML dump
    :return: True if a closing ``</ref>`` tag is present while no
        self-closing ``<references ... />`` tag is found
    """
    text = textlib.removeDisabledParts(text)
    # Coerce to bool: a bare ``search(...) and ...`` would return None
    # instead of False when there is no </ref> tag at all.
    return bool(_ref_regex.search(text)) and not _references_regex.search(text)


# XML dump page generator used for the -xml option: XMLDumpPageGenerator with
# _match_xml_page_text pre-bound as its text_predicate.
XmlDumpNoReferencesPageGenerator = partial(
    XMLDumpPageGenerator, text_predicate=_match_xml_page_text)


class NoReferencesBot(SingleSiteBot, ExistingPageBot):
    """References section bot.

    For every page that :meth:`lacksReferences` selects, the bot lets
    :meth:`addReferences` modify the text, runs the cosmetic changes
    toolkit over the result and saves the page with a combined summary.
    """

    use_disambigs = False
    use_redirects = False

    def __init__(self, **kwargs) -> None:
        """Initializer.

        Registers the ``verbose`` and ``ignore`` options and looks up the
        site specific references templates and substitute text.
        """
        self.available_options.update({
            'verbose': True,
            'ignore': CANCEL.MATCH,
        })
        super().__init__(**kwargs)
        # Prefix of every edit summary written by this bot.
        self.botname = '[[Bikarhêner:Balyozxane/skrîpt/py/cavkanileke.py|Bot]]'
        self.refR = _ref_regex
        self.referencesR = _references_regex
        self.referencesTagR = re.compile('<references>.*?</references>',
                                         re.IGNORECASE | re.DOTALL)
        try:
            self.referencesTemplates = referencesTemplates[
                self.site.family.name][self.site.code]
        except KeyError:
            self.referencesTemplates = []
        try:
            self.referencesText = referencesSubstitute[
                self.site.family.name][self.site.code]
        except KeyError:
            self.referencesText = '<references />'

    def lacksReferences(self, text) -> bool:
        """Check whether the page has to be processed by addReferences.

        The checks run on the text with disabled parts removed, in this
        order:

        1. a plain ``<references />`` tag is present: True, so that
           addReferences can replace it with the template
        2. any other references tag is present: False
        3. one of the references templates is present: False
        4. one of the templates ``1928``, ``1946``, ``1968`` or ``Çk`` is
           present: True
        5. no ``</ref>`` tag is present: False
        6. otherwise a ref tag without references was found: True

        :param text: wikitext of the page
        :return: True if the page should be modified
        """
        oldTextCleaned = textlib.removeDisabledParts(text)

        if re.search(r"<references\s*?/>", oldTextCleaned):
            return True
        if self.referencesR.search(oldTextCleaned) \
                or self.referencesTagR.search(oldTextCleaned):
            if self.opt.verbose:
                pywikibot.info('No changes necessary: references tag found.')
            return False

        if self.referencesTemplates:
            templateR = '{{(' + '|'.join(self.referencesTemplates) + ')'
            if re.search(templateR, oldTextCleaned, re.IGNORECASE):
                if self.opt.verbose:
                    pywikibot.info(
                        'No changes necessary: references template found.')
                return False

        parsed = mwparserfromhell.parse(oldTextCleaned)
        # filter_templates returns a list, so a non-empty result is truthy.
        # The name is bound as a default argument to avoid a late-binding
        # closure.
        if any(
            parsed.filter_templates(
                matches=lambda tpl, name=name: tpl.name.matches(name))
            for name in ("1928", "1946", "1968", 'Çk')
        ):
            return True

        if not self.refR.search(oldTextCleaned):
            if self.opt.verbose:
                pywikibot.info('No changes necessary: no ref tags found.')
            return False

        if self.opt.verbose:
            pywikibot.info('Found ref without references.')
        return True

    def addReferences(self, oldText) -> str:
        """
        Add a references tag into an existing section where it fits into.

        The first applicable action wins:

        1. replace a plain ``<references />`` tag with ``{{Çavkanî}}``
        2. turn ``{{Çk}}`` templates into ``{{subst:Çk}}``
        3. repair a malformed references tag
        4. add the references text to an existing references section
        5. create a new references section in front of one of the
           sections listed in ``placeBeforeSections``
        6. create a new references section above the trailing categories,
           interwiki links, templates and comments

        The edit summary ``self.comment`` is set accordingly.

        :param oldText: page text to be modified
        :type oldText: str
        :return: The modified pagetext
        """
        if re.search(r"<references\s*?/>", oldText):
            self.comment = f'{self.botname}: Etîketa <references /> bi şablona {{{{[[Şablon:Çavkanî|Çavkanî]]}}}} hat guhartin.'
            return re.sub(r"<\s*?references\s*?/>", "{{Çavkanî}}", oldText)

        parsed = mwparserfromhell.parse(oldText)
        substCk = False
        for template in parsed.filter_templates():
            template_name = mytools.ucfirst(template.name)
            if template_name == 'Çk':
                substCk = True
                template.name = 'subst:Çk'
        if substCk:
            cktemplate = '{{[[Şablon:Çk|Çk]]}}'
            cksubsttemp = '{{[[Şablon:Çk|subst:Çk]]}}'
            self.comment = f'{self.botname}: {cktemplate} kir {cksubsttemp}'
            return str(parsed)

        # Do we have a malformed <reference> tag which could be repaired?
        # Set the edit summary for this case
        self.comment = f'{self.botname}: Etîketa <references /> hat sererastkirin.'

        # Repair two opening tags or an opening and an empty tag
        pattern = re.compile(r'< *references *>(.*?)'
                             r'< */?\s*references */? *>', re.DOTALL)
        if pattern.search(oldText):
            pywikibot.info('Repairing references tag')
            return re.sub(pattern, r'<references>\1</references>', oldText)
        # Repair single unclosed references tag
        pattern = re.compile(r'< *references *>')
        if pattern.search(oldText):
            pywikibot.info('Repairing references tag')
            return re.sub(pattern, '<references />', oldText)

        # Is there an existing section where we can add the references tag?
        # Set the edit summary for this case. The braces are quadrupled so
        # that the f-string emits a balanced "{{...}}" pair.
        self.comment = f'{self.botname}: Şablona {{{{[[Şablon:çavkanî|Çavkanî]]}}}} lê hat zêdekirin.'
        for section in i18n.translate(self.site, referencesSections):
            sectionR = re.compile(fr'\r?\n=+ *{section} *=+ *\r?\n')
            index = 0
            while index < len(oldText):
                match = sectionR.search(oldText, index)
                if not match:
                    break
                if textlib.isDisabled(oldText, match.start()):
                    pywikibot.info(f'Existing {section} section is '
                                   f'commented out, skipping.')
                    index = match.end()
                else:
                    pywikibot.info(f'Adding references tag to existing'
                                   f' {section} section...\n')
                    # Leading templates and comments of the section stay
                    # below the inserted references text.
                    templates_or_comments = re.compile(
                        r'^((?:\s*(?:\{\{[^\{\}]*?\}\}|<!--.*?-->))*)',
                        flags=re.DOTALL)
                    new_text = (
                        oldText[:match.end() - 1]
                        + templates_or_comments.sub(
                            fr'\n{self.referencesText}\1',
                            oldText[match.end() - 1:]))
                    return new_text

        # Create a new section for the references tag
        for section in i18n.translate(self.site, placeBeforeSections):
            # Find out where to place the new section
            sectionR = re.compile(r'\r?\n(?P<ident>=+) *{} *(?P=ident) *\r?\n'
                                  .format(section))
            index = 0
            while index < len(oldText):
                match = sectionR.search(oldText, index)
                if not match:
                    break
                if textlib.isDisabled(oldText, match.start()):
                    pywikibot.info(
                        'Existing {} section is commented out, '
                        "won't add the references in front of it."
                        .format(section))
                    index = match.end()
                else:
                    pywikibot.info(
                        'Adding references section before {} section...\n'
                        .format(section))
                    index = match.start()
                    ident = match['ident']
                    return self.createReferenceSection(oldText, index,
                                                       ident)
        # This gets complicated: we want to place the new references
        # section over the interwiki links and categories, but also
        # over all navigation bars, persondata, and other templates
        # that are at the bottom of the page. So we need some advanced
        # regex magic.
        # The strategy is: create a temporary copy of the text. From that,
        # keep removing interwiki links, templates etc. from the bottom.
        # At the end, look at the length of the temp text. That's the position
        # where we'll insert the references section.
        catNamespaces = '|'.join(self.site.namespaces.CATEGORY)
        categoryPattern = fr'\[\[\s*({catNamespaces})\s*:[^\n]*\]\]\s*'
        interwikiPattern = r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]\s*'
        # won't work with nested templates
        # the negative lookahead assures that we'll match the last template
        # occurrence in the temp text.
        # FIXME:
        # {{commons}} or {{commonscat}} are part of Weblinks section
        # * {{template}} is mostly part of a section
        # so templatePattern must be fixed
        templatePattern = r'\r?\n{{((?!}}).)+?}}\s*'
        commentPattern = r'<!--((?!-->).)*?-->\s*'
        metadataR = re.compile(r'(\r?\n)?({}|{}|{}|{})$'
                               .format(categoryPattern, interwikiPattern,
                                       templatePattern, commentPattern),
                               re.DOTALL)
        tmpText = oldText
        while True:
            match = metadataR.search(tmpText)
            if not match:
                break
            tmpText = tmpText[:match.start()]
        pywikibot.info(
            'Found no section that can be preceded by a new references '
            'section.\nPlacing it before interwiki links, categories, and '
            'bottom templates.')
        index = len(tmpText)
        return self.createReferenceSection(oldText, index)

    def createReferenceSection(self, oldText, index, ident: str = '==') -> str:
        """Create a reference section and insert it into the given text.

        After the insertion a doubled ``== Çavkanî ==`` heading is merged
        into one, and the bottom templates, the sort key and the
        categories are moved to the end of the page in a fixed order.

        :param oldText: page text that is going to be amended
        :type oldText: str
        :param index: the index of oldText where the reference section should
            be inserted at
        :type index: int
        :param ident: symbols to be inserted before and after reference section
            title
        :return: the amended page text with reference section added
        """
        if self.site.code in noTitleRequired:
            ref_section = f'\n\n{self.referencesText}\n'
        else:
            ref_section = '\n\n{ident} {title} {ident}\n{text}\n'.format(
                title=i18n.translate(self.site, referencesSections)[0],
                ident=ident, text=self.referencesText)

        newText = oldText[:index].rstrip() + ref_section + oldText[index:]

        # Merge two directly consecutive "Çavkanî" headings into one.
        newText = re.sub(r'==\s*Çavkanî\s*==\s*==\s*Çavkanî\s*==', '== Çavkanî ==', newText)
        (text, koord_templates, templates, categories, defaultsort,
         template_sitil, kontroloto) = reorder_templates_and_remove_categories(
            newText)
        newText = append_templates_and_categories(text, koord_templates, templates, categories, defaultsort,
                                                  template_sitil, kontroloto)
        return newText

    def skip_page(self, page):
        """Check whether the page could be processed.

        Besides the checks of the parent classes, pages that are in one of
        the dialect categories listed below are skipped, as are IP-edited
        pages on the English Wikipedia.

        :param page: the page to check
        :return: True if the page has to be skipped
        """
        if super().skip_page(page):
            return True

        # Check if the page is in one of the specified categories
        categories_to_skip = [
            'Kategorî:Gotara bi kirmaşanî',
            'Kategorî:Gotara bi kurdiya başûr',
            'Kategorî:Gotara bi soranî',
            'Kategorî:Gotara bi zazakî'
        ]

        page_categories = {cat.title() for cat in page.categories()}

        if any(category in page_categories for category in categories_to_skip):
            pywikibot.warning(
                'Page {} is in a specified category. Skipping.'
                .format(page.title(as_link=True)))
            return True

        if self.site.sitename == 'wikipedia:en' and page.isIpEdit():
            pywikibot.warning(
                'Page {} is edited by IP. Possible vandalized'
                .format(page.title(as_link=True)))
            return True

        return False

    def do_kozmetik(self, old_text):
        """Run the cosmetic changes toolkit over *old_text*.

        :param old_text: page text after the references handling
        :return: tuple ``(text, summary_suffix)``. ``summary_suffix`` is
            empty when nothing was changed; otherwise it names the
            cosmetic changes and is meant to be appended to the edit
            summary.
        """
        cc_toolkit = CosmeticChangesToolkit(self.current_page,
                                            ignore=self.opt.ignore)
        new_text, summaries = cc_toolkit.change(old_text)

        if new_text is False:
            # NOTE(review): presumably False means the toolkit cancelled its
            # changes -- confirm against kucosmetics. Keep the input text so
            # that the caller never tries to save False as page content.
            return old_text, ""

        kozmetik_cebu = ""
        if new_text != old_text:
            kozmetik_cebu = "; paqijiyên kozmetîk"
            applied_summaries = ', '.join(summaries.values())
            if applied_summaries:
                kozmetik_cebu += f' ({applied_summaries}.)'

        return new_text, kozmetik_cebu

    def treat_page(self) -> None:
        """Run the bot.

        Loads the text of the current page and, if lacksReferences selects
        it, saves the text produced by addReferences and do_kozmetik.
        """
        page = self.current_page
        try:
            text = page.text
        except LockedPageError:
            pywikibot.warning('Page {} is locked?!'
                              .format(page.title(as_link=True)))
            return

        if self.lacksReferences(text):
            new_text = self.addReferences(text)
            cleaned_new_text, kozmetik_cebu = self.do_kozmetik(new_text)

            self.put_current(cleaned_new_text, summary=f"{self.comment}{kozmetik_cebu}")


def main(*args: str) -> None:
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used. Without any page
    generator the articles of the default maintenance category are used;
    if that category cannot be found either, the help hint is shown.

    :param args: command line arguments
    """
    bot_options = {}
    xml_gen = None

    # Process global args and prepare generator args parser
    local_args = pywikibot.handle_args(args)
    gen_factory = pagegenerators.GeneratorFactory()

    for argument in local_args:
        flag, _, param = argument.partition(':')
        if flag == '-always':
            bot_options['always'] = True
        elif flag == '-quiet':
            bot_options['verbose'] = False
        elif flag == '-xml':
            # Ask for the dump file name when it was not given inline.
            dump_path = param or i18n.input('pywikibot-enter-xml-filename')
            xml_gen = XmlDumpNoReferencesPageGenerator(dump_path)
        elif flag == '-ignore':
            mode = param.upper()
            try:
                bot_options['ignore'] = getattr(CANCEL, mode)
            except AttributeError:
                raise ValueError(f'Unknown ignore mode {mode!r}!')
        else:
            # Everything else is left to the page generator factory.
            gen_factory.handle_arg(argument)

    generator = gen_factory.getCombinedGenerator(xml_gen, preload=True)
    if not generator:
        # Fall back to the articles of the maintenance category.
        site = pywikibot.Site()
        fallback_cat = site.page_from_repository(maintenance_category)
        if fallback_cat:
            generator = fallback_cat.articles(
                namespaces=gen_factory.namespaces or [0])

    if not generator:
        pywikibot.bot.suggest_help(missing_generator=True)
        return

    bot = NoReferencesBot(generator=generator, **bot_options)
    bot.run()


# Start the bot only when this file is executed as a script.
if __name__ == '__main__':
    main()