"""
this is a fork of master cosmetic_changes.py which comes with pywikibot. The script can run standalone using [[Bikarhêner:Balyozxane/skrîpt/py/kuCosmeticsRun.py]] which doesn't edit the page if only whitespace changes are detected.
Changes:
fixArabicLetters removed
removeEmptySections disabled.
fixStyle enabled for ku. Changes prettytable to wikitable
Added:
fixVrefNames --> renames visual editor ref names.
replaceDeprecatedTemplates --> changes redirected templates to target template using a json file ([[Bikarhêner:Balyozxane/skrîpt/py/listeyasablonan.py]]).
fixOthers --> A few standard changes for ku.wiki
replaceDeprecatedParams --> Uses WP:AutoWikiBrowser/Rename template parameters to replace deprecated/English parameters
removeDupeCats --> Removes dublicate categories
fixAgahidankSpace --> standardizes the number of space characters between Agahîdank templates
removeSelfCat --> removes category from self-categoriezed cats
"""
import json
import re
import string
from contextlib import suppress
from enum import IntEnum
from typing import Any, Union
from urllib.parse import urlparse, urlunparse

import mwparserfromhell
import pywikibot
from pywikibot import textlib
from pywikibot.backports import Callable, Dict, List, Match, Pattern, Tuple
from pywikibot.exceptions import InvalidTitleError
from pywikibot.textlib import (
    FILE_LINK_REGEX,
    MultiTemplateMatchBuilder,
    get_regexes,
)
from pywikibot.tools import first_lower, first_upper
from pywikibot.tools.chars import url2string

try:
    import stdnum.isbn as stdnum_isbn
except ImportError:
    stdnum_isbn = None
# Subpage templates. Must be in lower case,
# whereas subpage itself must be case sensitive
# This is also used by interwiki.py
# TODO: Maybe move it to family file and implement global instances
moved_links = {
'ku': (['documentation', 'belgekirin'], '/belge'),
}
class CANCEL(IntEnum):
"""Cancel level to ignore exceptions.
If an error occurred and either skips the page or the method
or a single match. ALL raises the exception.
.. versionadded:: 6.3
"""
ALL = 0
PAGE = 1
METHOD = 2
MATCH = 3
def _format_isbn_match(match: Match[str], strict: bool = True) -> str:
"""Helper function to validate and format a single matched ISBN."""
if not stdnum_isbn:
raise NotImplementedError(
'ISBN functionality not available. Install stdnum package.')
isbn = match['code']
try:
stdnum_isbn.validate(isbn)
except stdnum_isbn.ValidationError as e:
if strict:
raise
pywikibot.log(f'ISBN "{isbn}" validation error: {e}')
return isbn
return stdnum_isbn.format(isbn)
def _reformat_ISBNs(text: str, strict: bool = True) -> str:
"""Helper function to normalise ISBNs in text.
:raises Exception: Invalid ISBN encountered when strict enabled
"""
return textlib.reformat_ISBNs(
text, lambda match: _format_isbn_match(match, strict=strict))
class CosmeticChangesToolkit:
"""Cosmetic changes toolkit.
.. versionchanged:: 7.0
`from_page()` method was removed
"""
def __init__(self, page: 'pywikibot.page.BasePage', *,
show_diff: bool = False,
ignore: IntEnum = CANCEL.ALL) -> None:
"""Initializer.
.. versionchanged:: 5.2
instantiate the CosmeticChangesToolkit from a page object;
only allow keyword arguments except for page parameter;
`namespace` and `pageTitle` parameters are deprecated
.. versionchanged:: 7.0
`namespace` and `pageTitle` parameters were removed
:param page: the Page object containing the text to be modified
:param show_diff: show difference after replacements
:param ignore: ignores if an error occurred and either skips the page
or only that method. It can be set one of the CANCEL constants
"""
self.site = page.site
self.title = page.title()
self.namespace = page.namespace()
self.show_diff = show_diff
self.template = (self.namespace == 10)
self.talkpage = self.namespace >= 0 and self.namespace % 2 == 1
self.ignore = ignore
self.summaries = {}
self.common_methods = [
self.fixSelfInterwiki,
self.standardizePageFooter,
self.fixSyntaxSave,
self.cleanUpLinks,
self.cleanUpSectionHeaders,
self.putSpacesInLists,
self.translateAndCapitalizeNamespaces,
self.translateMagicWords,
self.replaceDeprecatedTemplates,
self.resolveHtmlEntities,
self.removeUselessSpaces,
self.removeNonBreakingSpaceBeforePercent,
self.fixHtml,
self.fixReferences,
self.fixVrefNames,
self.fixStyle,
self.fixTypo,
self.fixOthers,
self.replaceDeprecatedParams,
self.removeDupeCats,
self.fixAgahidankSpace,
self.removeSelfCat
]
if stdnum_isbn:
self.common_methods.append(self.fix_ISBN)
# Define the explanation for each method
        self.method_explanations = {
'fixSelfInterwiki': 'Lînkên înterwîkî sererast kir',
'fix_ISBN': 'ISBN sererast kir',
'standardizePageFooter': 'Binê rûpelê standard kir',
'fixSyntaxSave': 'Xeletiyên sentaksê sererast kir',
'cleanUpLinks': 'Lînk paqij kir',
'cleanUpSectionHeaders': 'Valahiya beşan sererast kir',
'putSpacesInLists': 'Valahî li lîsteyan zêde kir',
'translateAndCapitalizeNamespaces': 'Valahiya nav tercime û mezin kir',
'translateMagicWords': 'Kelîmeyên sihirî tercime kir',
'replaceDeprecatedTemplates': 'Şablonên beralîkirî guhart',
'resolveHtmlEntities': 'HTML sererast kir',
'removeUselessSpaces': 'Valahiyên ne hewce jê bir',
'removeNonBreakingSpaceBeforePercent': 'Valahiya beriya sedî jê bir',
'fixHtml': 'Xeletiyên HTMLê sererast kir',
'fixReferences': 'Referans sererast kir',
'fixVrefNames': 'Navên referansan sererast kir',
            'fixStyle': 'Xeletiyên stîlê sererast kir',
'fixTypo': 'Xeletî sererast kir',
'fixOthers': 'Sernavên beşan sererast kir',
'replaceDeprecatedParams': 'Parametreyên îngilîzî/xelet sererast kir',
'removeDupeCats': 'Kategoriya ducarî jê bir',
'fixAgahidankSpace': 'Valahiya di agahîdankê de standard kir',
'removeSelfCat': 'Kategoriya li ser xwe jê bir'
}
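        # One summary fragment is recorded per method that actually changed
        # the text (see _check_modification); change() returns the fragments
        # alongside the new text so the caller can build the save summary.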
def safe_execute(self, method: Callable[[str], str], text: str) -> str:
"""Execute the method and catch exceptions if enabled."""
result = None
try:
result = method(text)
except Exception as e:
if self.ignore == CANCEL.METHOD:
pywikibot.warning('Unable to perform "{}" on "{}"!'
.format(method.__name__, self.title))
pywikibot.error(e)
else:
raise
return text if result is None else result
    def _generate_summary(self, method_name: str) -> str:
        """Return the edit summary fragment for the given method."""
        return self.method_explanations.get(method_name,
                                            'sererastkirinên din')

    def _check_modification(self, method_name: str, old_text: str,
                            new_text: str) -> None:
        """Record a summary fragment if a method modified the text."""
        if old_text != new_text:
            self.summaries[method_name] = self._generate_summary(method_name)
def _change(self, text: str) -> str:
"""Execute all clean up methods."""
modified_text = text
for method in self.common_methods:
old_text = modified_text
modified_text = self.safe_execute(method, modified_text)
self._check_modification(method.__name__, old_text, modified_text)
return modified_text
    def change(self, text: str) -> Union[bool, Tuple[str, Dict[str, str]]]:
"""Execute all clean up methods and catch errors if activated."""
try:
new_text = self._change(text)
except Exception as e:
if self.ignore == CANCEL.PAGE:
pywikibot.warning('Skipped "{}", because an error occurred.'
.format(self.title))
pywikibot.error(e)
return False
raise
else:
if self.show_diff:
pywikibot.showDiff(text, new_text)
return new_text, self.summaries
def fixOthers(self, text: str) -> str:
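        """Apply a few standard section-heading fixes for ku.wiki.

        Example: ``== Çavakanî ==`` becomes ``== Çavkanî ==``.
        """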
replacements = {
r'==\s*[gG]ir[eê]dan[aêîi]n?\s+[Dd]erv(a|eyî|ê)\s*==': '== Girêdanên derve ==',
r'==\s*Erdn[îi]garî?\s*==': '== Erdnîgarî ==',
r'==\s*[Çç]ava?kanî\s*==': '== Çavkanî ==',
r'==\s*Tûrîzm\s*==': '== Turîzm ==',
r'==\s*[bB]in[eê]r[eê] [Jj]î\s*==': '== Binêre herwiha =='
}
for pattern, replacement in replacements.items():
text = re.sub(pattern, replacement, text)
return text
def removeSelfCat(self, text: str) -> str:
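        """Remove a category link pointing to the category page itself.

        Only runs in the category namespace; sort keys of the remaining
        category links are preserved.
        """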
if self.namespace != 14:
return text
category_links = textlib.getCategoryLinks(text, site=self.site)
# Construct new category links without self.title while preserving sortkeys
new_category_links = []
for category in category_links:
if category.title() != self.title:
sortkey = category.sortKey
if sortkey:
new_category_links.append(f"{category.title()}|{sortkey}")
else:
new_category_links.append(category.title())
# Replace existing categories with new category links
text = textlib.replaceCategoryLinks(text, new_category_links, site=self.site)
return text
    def removeDupeCats(self, text: str) -> str:
        """Remove duplicate category links, keeping the first occurrence."""
        category_pattern = r'\[\[Kategorî:(.*?)\]\]'
        seen = set()

        def keep_first(match: Match[str]) -> str:
            category = match[1]
            if category in seen:
                # Drop every occurrence after the first one.
                return ''
            seen.add(category)
            return match.group()

        return re.sub(category_pattern, keep_first, text)
def replaceDeprecatedParams(self, text: str) -> str:
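        """Rename deprecated/English template parameters for ku.wiki.

        The renames are read from ``parambikejson.json``, which is expected
        to map a template name to old-name/new-name pairs, roughly
        (illustrative keys)::

            {"Şablonek": {"date": "tarîx", "title": "sernav"}}
        """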
with open('parambikejson.json', encoding='utf-8') as f:
alias_dict = json.load(f)
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
            template_name = template.name.strip()
            if not template_name:
                continue
            # Capitalize the first letter of the template name
            template_name = first_upper(template_name)
# Check if the capitalized template name is in alias_dict
if template_name in alias_dict:
params_to_replace = alias_dict[template_name]
# Loop through each parameter in the template
for param in template.params:
param_name = param.name.strip()
# Check if the parameter name needs replacing
if param_name in params_to_replace:
new_param_name = params_to_replace[param_name]
param.name = new_param_name
text = str(wikicode)
return text
def fixAgahidankSpace(self, text: str) -> str:
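        """Standardize spacing inside Agahîdank (infobox) templates.

        Parameter names are padded so the ``=`` signs line up and each
        value ends with a line break, roughly (illustrative)::

            {{Agahîdank Mirov
             | nav                = Foo
             | sal                = 2000
            }}
        """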
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
if template.name.strip().startswith("Agahîdank"):
if template.params:
# Iterate over the parameters and format them
for param in template.params:
# Calculate space padding based on the length of the parameter name
param_name_length = len(param.name.strip())
if param_name_length <= 17:
space_padding = " " * (18 - param_name_length) + " "
else:
space_padding = " "
# Add a line break after each parameter value
param.value = " " + param.value.strip() + "\n"
# Update parameter name with padding
param.name = " {}{}".format(param.name.strip(), space_padding)
# Add a line break after the template name
template.name = template.name.strip() + "\n"
return str(wikicode)
def replaceDeprecatedTemplates(self, text: str) -> str:
"""
Renames redirected templates from redirected_template_mappings.json for kuwiki
"""
# Load JSON file containing template name mappings
with open('redirected_template_mappings.json', encoding='utf-8') as f:
template_mappings = json.load(f)
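        # Expected shape, inferred from the lookup below (illustrative):
        # {"Old template name": {"rd_title": "Target template name"}, ...}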
wikicode = mwparserfromhell.parse(text)
# Iterate over each template in the parsed text
for template in wikicode.filter_templates():
            old_name = template.name.strip()
            if not old_name:
                continue
            old_name = first_upper(old_name)
# Check if the template name exists in the JSON mappings
if old_name in template_mappings:
new_name = template_mappings[old_name]["rd_title"] # Get the new template name
# Find the position of the old template name in template.name
start_index = template.name.lower().find(old_name.lower())
# Replace the old template name with the new one in template.name
template.name = (
template.name[:start_index] + new_name + template.name[start_index + len(old_name):]
)
# Convert the modified wikicode back to text
text = str(wikicode)
return text
def fixVrefNames(self, text: str) -> str:
"""
taken from [[:en:User:Qwerfjkl/VEref.py]] which is itself taken
from [[:en:User:Psiĥedelisto/VisualEditor ref namer.py]]
The VisualEditor, (very annoyingly!), doesn't name references added by users, and gives them names like :0, :1, etc. This script fixes that automatically
Changes some lower case template names to upper and vice versa
"""
        parsed = mwparserfromhell.parse(text)
        tags = [t for t in parsed.ifilter(
                    forcetype=mwparserfromhell.wikicode.Tag,
                    matches=r"<\s*ref\s*", recursive=True)
                if t.has("name")]
        refs = [t for t in tags
                if re.search(r"^:\d+$", str(t.get("name").value))
                and not re.search(r"/>$", str(t))]
pretty = dict()
for ref in refs:
template = ref.contents.get(0)
if not isinstance(template, mwparserfromhell.nodes.Template): # Check if template is a Template object
continue
if template.has("vauthors"):
v = str(template.get("vauthors").value)
elif template.has("authors"):
v = str(template.get("authors").value)
elif template.has("paşnav"):
v = str(template.get("paşnav").value)
elif template.has("pêşnav"):
v = str(template.get("pêşnav").value)
else:
continue
v = v.strip()
if "," in v:
last = v[:v.index(",")]
elif " " in v:
last = v[:v.index(" ")]
else:
last = v
punctuation = set(string.punctuation)
# Strip punctuation characters from the last word directly
last = ''.join([char for char in last if char not in punctuation])
if re.match(r'^[0-9\-.,]+$', last):
last = False
else:
# Check if the last name contains Latin alphabet characters
latin_alphabet = set(string.ascii_letters)
if not any(char in latin_alphabet for char in last):
last = False
            date = False
            if template.has("tarîx"):
                date = str(template.get("tarîx").value)
            elif template.has("dîrok"):
                date = str(template.get("dîrok").value)
            elif template.has("sal"):
                date = str(template.get("sal").value)
            if date and last:
                match = re.search(r"\d{4}", date)
                if match:
                    date = match[0]
                pretty[str(ref.get("name").value)] = "{}{}".format(last, date)
        for tag in parsed.ifilter(forcetype=mwparserfromhell.wikicode.Tag,
                                  matches=r"<\s*ref\s*", recursive=True):
            if not tag.has("name"):
                continue
            k = str(tag.get("name").value)
            if k in pretty:
                # Set the name attribute itself rather than assuming it is
                # the first attribute of the tag.
                tag.get("name").value = pretty[k]
        return str(parsed)
def fixSelfInterwiki(self, text: str) -> str:
"""
Interwiki links to the site itself are displayed like local links.
Remove their language code prefix.
"""
if not self.talkpage and pywikibot.calledModuleName() != 'interwiki':
interwikiR = re.compile(r'\[\[(?: *:)? *{} *: *([^\[\]\n]*)\]\]'
.format(self.site.code))
text = interwikiR.sub(r'[[\1]]', text)
return text
def standardizePageFooter(self, text: str) -> str:
"""
Standardize page footer.
Makes sure that interwiki links and categories are put
into the correct position and into the right order. This
combines the old instances of standardizeInterwiki
and standardizeCategories.
The page footer consists of the following parts
in that sequence:
1. categories
2. additional information depending on the local site policy
3. interwiki
"""
assert self.title is not None
categories = []
interwiki_links = {}
# get categories
if not self.template:
categories = textlib.getCategoryLinks(text, site=self.site)
if not self.talkpage:
subpage = False
if self.template:
try:
tmpl, loc = moved_links[self.site.code]
del tmpl
except KeyError:
loc = None
if loc is not None and loc in self.title:
subpage = True
# get interwiki
interwiki_links = textlib.getLanguageLinks(
text, insite=self.site, template_subpage=subpage)
# remove interwiki
text = textlib.removeLanguageLinks(text, site=self.site)
# add categories, main to top
if categories:
# TODO: Sort categories in alphabetic order, e.g. using
# categories.sort()? (T100265)
# TODO: Get main categories from Wikidata?
main = pywikibot.Category(self.site, 'Category:' + self.title,
sort_key=' ')
if main in categories:
categories.pop(categories.index(main))
categories.insert(0, main)
text = textlib.replaceCategoryLinks(text, categories,
site=self.site)
# add interwiki
if interwiki_links:
text = textlib.replaceLanguageLinks(text, interwiki_links,
site=self.site,
template=self.template,
template_subpage=subpage)
return text
def translateAndCapitalizeNamespaces(self, text: str) -> str:
"""Use localized namespace names.
.. versionchanged:: 7.4
No longer expect a specific namespace alias for File:
"""
# arz uses English stylish codes
if self.site.sitename == 'wikipedia:arz':
return text
# wiki links aren't parsed here.
exceptions = ['nowiki', 'comment', 'math', 'pre']
for namespace in self.site.namespaces.values():
if namespace == 0:
# skip main (article) namespace
continue
# a clone is needed. Won't change the namespace dict
namespaces = list(namespace)
if namespace == 6 and self.site.family.name == 'wikipedia':
if self.site.code in ('en', 'fr'):
# do not change "Image" on en-wiki and fr-wiki
with suppress(ValueError):
namespaces.remove('Image')
if self.site.code == 'hu':
# do not change "Kép" on hu-wiki
with suppress(ValueError):
namespaces.remove('Kép')
elif self.site.code == 'pt':
# use "Imagem" by default on pt-wiki (per T57242)
with suppress(ValueError):
namespaces.insert(
0, namespaces.pop(namespaces.index('Imagem')))
# final namespace variant
final_ns = namespaces.pop(0)
if namespace in (2, 3):
# skip localized user namespace, maybe gender is used
namespaces = ['User' if namespace == 2 else 'User talk']
# lowerspaced and underscored namespaces
for i, item in enumerate(namespaces):
item = item.replace(' ', '[ _]')
item = f'[{item[0]}{item[0].lower()}]' + item[1:]
namespaces[i] = item
namespaces.append(first_lower(final_ns))
if final_ns and namespaces:
if self.site.sitename == 'wikipedia:pt' and namespace == 6:
# only change on these file extensions (per T57242)
extensions = ('png', 'gif', 'jpg', 'jpeg', 'svg', 'tiff',
'tif')
text = textlib.replaceExcept(
text,
r'\[\[\s*({}) *:(?P<name>[^\|\]]*?\.({}))'
r'(?P<label>.*?)\]\]'
.format('|'.join(namespaces), '|'.join(extensions)),
fr'[[{final_ns}:\g<name>\g<label>]]',
exceptions)
else:
text = textlib.replaceExcept(
text,
r'\[\[\s*({}) *:(?P<nameAndLabel>.*?)\]\]'
.format('|'.join(namespaces)),
fr'[[{final_ns}:\g<nameAndLabel>]]',
exceptions)
return text
def translateMagicWords(self, text: str) -> str:
"""Use localized magic words."""
# not wanted at ru
# arz uses English stylish codes
# no need to run on English wikis
if self.site.code in ['arz', 'en', 'ru']:
return text
def init_cache() -> None:
for magicword in ('img_thumbnail', 'img_left', 'img_center',
'img_right', 'img_none', 'img_framed',
'img_frameless', 'img_border', 'img_upright',
'img_baseline', 'img_sub', 'img_super',
'img_top', 'img_text_top', 'img_middle',
'img_bottom', 'img_text_bottom'):
aliases = self.site.getmagicwords(magicword)
if len(aliases) > 1:
cache.update((alias, aliases[0]) for alias in aliases[1:]
if '$1' not in alias)
if not cache:
cache[False] = True # signal there is nothing to replace
def replace_magicword(match: Match[str]) -> str:
if cache.get(False):
return match.group()
split = match.group().split('|')
if len(split) == 1:
return match.group()
if not cache:
init_cache()
# push ']]' out and re-add below
split[-1] = split[-1][:-2]
return '{}|{}]]'.format(
split[0], '|'.join(cache.get(x.strip(), x) for x in split[1:]))
cache: Dict[Union[bool, str], Any] = {}
exceptions = ['comment', 'nowiki', 'pre', 'syntaxhighlight']
regex = re.compile(
FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
flags=re.X)
return textlib.replaceExcept(
text, regex, replace_magicword, exceptions)
def cleanUpLinks(self, text: str) -> str:
"""Tidy up wikilinks found in a string.
This function will:
* Replace underscores with spaces
* Move leading and trailing spaces out of the wikilink and into the
surrounding text
* Convert URL-encoded characters into Unicode-encoded characters
* Move trailing characters out of the link and make the link without
using a pipe, if possible
* Capitalize the article title of the link, if appropriate
.. versionchanged:: 8.4
Convert URL-encoded characters if a link is an interwiki link
or different from main namespace.
:param text: string to perform the clean-up on
:return: text with tidied wikilinks
"""
# helper function which works on one link and either returns it
# unmodified, or returns a replacement.
def handleOneLink(match: Match[str]) -> str:
# Convert URL-encoded characters to str
titleWithSection = url2string(match['titleWithSection'],
encodings=self.site.encodings())
label = match['label']
trailingChars = match['linktrail']
newline = match['newline']
# entire link but convert URL-encoded text
oldlink = url2string(match.group(),
encodings=self.site.encodings())
is_interwiki = self.site.isInterwikiLink(titleWithSection)
if is_interwiki:
return oldlink
# The link looks like this:
# [[page_title|link_text]]trailing_chars
# We only work on namespace 0 because pipes and linktrails work
# differently for images and categories.
page = pywikibot.Page(pywikibot.Link(titleWithSection, self.site))
try:
in_main_namespace = page.namespace() == 0
except InvalidTitleError:
in_main_namespace = False
if not in_main_namespace:
return oldlink
# Replace underlines by spaces, also multiple underlines
titleWithSection = re.sub('_+', ' ', titleWithSection)
# Remove double spaces
titleWithSection = re.sub(' +', ' ', titleWithSection)
# Remove unnecessary leading spaces from title,
# but remember if we did this because we eventually want
# to re-add it outside of the link later.
titleLength = len(titleWithSection)
titleWithSection = titleWithSection.lstrip()
hadLeadingSpaces = len(titleWithSection) != titleLength
hadTrailingSpaces = False
# Remove unnecessary trailing spaces from title,
# but remember if we did this because it may affect
# the linktrail and because we eventually want to
# re-add it outside of the link later.
if not trailingChars:
titleLength = len(titleWithSection)
titleWithSection = titleWithSection.rstrip()
hadTrailingSpaces = len(titleWithSection) != titleLength
if not titleWithSection:
# just skip empty links.
return match.group()
# Remove unnecessary initial and final spaces from label.
# Please note that some editors prefer spaces around pipes.
# (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
if label is not None:
# Remove unnecessary leading spaces from label,
# but remember if we did this because we want
# to re-add it outside of the link later.
labelLength = len(label)
label = label.lstrip()
hadLeadingSpaces = len(label) != labelLength
# Remove unnecessary trailing spaces from label,
# but remember if we did this because it affects
# the linktrail.
if not trailingChars:
labelLength = len(label)
label = label.rstrip()
hadTrailingSpaces = len(label) != labelLength
else:
label = titleWithSection
if trailingChars:
label += trailingChars
if self.site.siteinfo['case'] == 'first-letter':
firstcase_title = first_lower(titleWithSection)
firstcase_label = first_lower(label)
else:
firstcase_title = titleWithSection
firstcase_label = label
if firstcase_label == firstcase_title:
newLink = f'[[{label}]]'
# Check if we can create a link with trailing characters
# instead of a pipelink
elif (firstcase_label.startswith(firstcase_title)
and trailR.sub('', label[len(titleWithSection):]) == ''):
newLink = '[[{}]]{}'.format(label[:len(titleWithSection)],
label[len(titleWithSection):])
else:
# Try to capitalize the first letter of the title.
# Not useful for languages that don't capitalize nouns.
# TODO: Add a configuration variable for each site,
# which determines if the link target is written in
# uppercase
if self.site.sitename == 'wikipedia:de':
titleWithSection = first_upper(titleWithSection)
newLink = f'[[{titleWithSection}|{label}]]'
# re-add spaces that were pulled out of the link.
# Examples:
# text[[ title ]]text -> text [[title]] text
# text[[ title | name ]]text -> text [[title|name]] text
# text[[ title |name]]text -> text[[title|name]]text
# text[[title| name]]text -> text [[title|name]]text
if hadLeadingSpaces and not newline:
newLink = ' ' + newLink
if hadTrailingSpaces:
newLink += ' '
if newline:
newLink = newline + newLink
return newLink
trailR = re.compile(self.site.linktrail())
# The regular expression which finds links. Results consist of four groups:
# group <newline> depends whether the links starts with a new line.
# group <titleWithSection> is the page title and section, that is,
# everything before | or ]. It'll include the # to make life easier for us.
# group <label> is the alternative link title between | and ].
# group <linktrail> is the link trail after ]] which are part of the word.
# note that the definition of 'letter' varies from language to language.
linkR = re.compile(
r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
r'(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>'
+ self.site.linktrail() + ')')
text = textlib.replaceExcept(text, linkR, handleOneLink,
['comment', 'math', 'nowiki', 'pre',
'startspace'])
return text
def resolveHtmlEntities(self, text: str) -> str:
"""Replace HTML entities with string."""
        ignore = [
            38,     # Ampersand (&)
            39,     # Single quotation mark (') per T26093
            60,     # Less than (<)
            62,     # Greater than (>)
            91,     # Opening square bracket ([)
                    # - sometimes used intentionally inside links
            93,     # Closing square bracket (])
                    # - used intentionally inside links
            124,    # Vertical bar (|)
                    # - used intentionally in navigation bar templates on w:de
            160,    # Non-breaking space (&nbsp;)
                    # - not supported by Firefox textareas
            173,    # Soft hyphen (&shy;) - enable editing
            8206,   # Left-to-right mark (&lrm;)
            8207,   # Right-to-left mark (&rlm;)
        ]
        if self.template:
            ignore.append(32)  # Space ( )
            ignore.append(58)  # Colon (:)
# TODO: T254350 - what other extension tags should be avoided?
# (graph, math, score, timeline, etc.)
text = pywikibot.html2unicode(
text, ignore=ignore, exceptions=['comment', 'syntaxhighlight'])
return text
def removeUselessSpaces(self, text: str) -> str:
"""Cleanup multiple or trailing spaces."""
exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
'startspace', 'table']
if self.site.sitename != 'wikipedia:cs':
exceptions.append('template')
text = textlib.replaceExcept(text, r'(?m)[\t ]+( |$)', r'\1',
exceptions, site=self.site)
text = re.sub(r'\n\n\n*', '\n\n', text)
return text
def removeNonBreakingSpaceBeforePercent(self, text: str) -> str:
"""
Remove a non-breaking space between number and percent sign.
Newer MediaWiki versions automatically place a non-breaking space in
front of a percent sign, so it is no longer required to place it
manually.
"""
text = textlib.replaceExcept(
text, r'(\d)&(?:nbsp|#160|#x[Aa]0);%', r'\1 %', ['timeline'])
return text
def cleanUpSectionHeaders(self, text: str) -> str:
"""
Add a space between the equal signs and the section title.
Example::
==Section title==
becomes::
== Section title ==
.. note:: This space is recommended in the syntax help on the
English and German Wikipedias. It is not wanted on Lojban and
English Wiktionaries (:phab:`T168399`, :phab:`T169064`) and
it might be that it is not wanted on other wikis. If there
are any complaints, please file a bug report.
"""
if self.site.sitename in ['wiktionary:jbo', 'wiktionary:en']:
return text
return textlib.replaceExcept(
text,
r'(?m)^(={1,6})[ \t]*(?P<title>.*[^\s=])[ \t]*\1[ \t]*\r?\n',
r'\1 \g<title> \1\n',
['comment', 'math', 'nowiki', 'pre'])
def putSpacesInLists(self, text: str) -> str:
"""
Add a space between the * or # and the text.
.. note:: This space is recommended in the syntax help on the
English, German and French Wikipedias. It might be that it
is not wanted on other wikis. If there are any complaints,
please file a bug report.
"""
if not self.template:
exceptions = ['comment', 'math', 'nowiki', 'pre',
'syntaxhighlight', 'template', 'timeline',
self.site.redirect_regex]
text = textlib.replaceExcept(
text,
r'(?m)'
r'^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
r'\g<bullet> \g<char>',
exceptions)
return text
# from fixes.py
def fixSyntaxSave(self, text: str) -> str:
"""Convert weblinks to wikilink, fix link syntax."""
def replace_link(match: Match[str]) -> str:
"""Create a string to replace a single link."""
replacement = '[['
if re.match(r'(?:{}):'
.format('|'.join((*self.site.namespaces[6],
*self.site.namespaces[14]))),
match['link']):
replacement += ':'
replacement += match['link']
if match['title']:
replacement += '|' + match['title']
return replacement + ']]'
exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
'syntaxhighlight']
# link to the wiki working on
# Only use suffixes for article paths
for suffix in self.site._interwiki_urls(True):
http_url = self.site.base_url(suffix, 'http')
if self.site.protocol() == 'http':
https_url = None
else:
https_url = self.site.base_url(suffix, 'https')
# compare strings without the protocol, if they are empty support
# also no prefix (//en.wikipedia.org/…)
http = urlparse(http_url)
https = urlparse(https_url)
if https_url is not None and http.netloc == https.netloc:
urls = ['(?:https?:)?'
+ re.escape(urlunparse(('', *http[1:])))]
else:
urls = [re.escape(url) for url in (http_url, https_url)
if url is not None]
for url in urls:
# unescape {} placeholder
url = url.replace(r'\{\}', '{title}')
# Only include links which don't include the separator
# as the wikilink won't support additional parameters
separator = '?&' if '?' in suffix else '?'
# Match first a non space in the title to prevent that multiple
# spaces at the end without title will be matched by it
            title_regex = (r'(?P<link>[^{sep}]+?)'
                           r'(\s+(?P<title>[^\s].*?))?'
                           .format(sep=separator))
url_regex = fr'\[\[?{url}?\s*\]\]?'
text = textlib.replaceExcept(
text,
url_regex.format(title=title_regex),
replace_link, exceptions, site=self.site)
# external link in/starting with double brackets
text = textlib.replaceExcept(
text,
r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
r'[\g<url>]', exceptions, site=self.site)
# external link and description separated by a pipe, with
# whitespace in front of the pipe, so that it is clear that
# the dash is not a legitimate part of the URL.
text = textlib.replaceExcept(
text,
r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
r'[\g<url> \g<label>]', exceptions)
# dash in external link, where the correct end of the URL can
# be detected from the file extension. It is very unlikely that
# this will cause mistakes.
extensions = [fr'\.{ext}'
for ext in ['pdf', 'html?', 'php', 'aspx?', 'jsp']]
text = textlib.replaceExcept(
text,
r'\[(?P<url>https?://[^\|\] ]+?(' + '|'.join(extensions) + r')) *'
r'\| *(?P<label>[^\|\]]+?)\]',
r'[\g<url> \g<label>]', exceptions)
return text
def fixHtml(self, text: str) -> str:
"""Relace html markups with wikitext markups."""
def replace_header(match: Match[str]) -> str:
"""Create a header string for replacing."""
depth = int(match[1])
return r'{0} {1} {0}'.format('=' * depth, match[2])
# Everything case-insensitive (?i)
# Keep in mind that MediaWiki automatically converts <br> to <br />
exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
'syntaxhighlight']
text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>',
r"'''\2'''", exceptions, site=self.site)
text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>',
r"''\2''", exceptions, site=self.site)
# horizontal line without attributes in a single line
text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
r'\1----\2', exceptions)
# horizontal line with attributes; can't be done with wiki syntax
# so we only make it XHTML compliant
text = textlib.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
r'<hr \1 />',
exceptions)
# a header where only spaces are in the same line
text = textlib.replaceExcept(
text,
r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])',
replace_header,
exceptions)
# TODO: maybe we can make the bot replace <p> tags with \r\n's.
return text
def fixReferences(self, text: str) -> str:
"""Fix references tags."""
# See also
# https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
'startspace']
# it should be name = " or name=" NOT name ="
text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
# remove empty <ref/>-tag
text = textlib.replaceExcept(text,
r'(?i)(<ref\s*/>|<ref *>\s*</ref>)',
r'', exceptions)
text = textlib.replaceExcept(text,
r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>',
r'<ref \1/>', exceptions)
return text
def fixStyle(self, text: str) -> str:
"""Convert prettytable to wikitable class."""
exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
'syntaxhighlight']
if self.site.code in ('de', 'en', 'ku'):
text = textlib.replaceExcept(text,
r'(class="[^"]*)prettytable([^"]*")',
r'\1wikitable\2', exceptions)
return text
def fixTypo(self, text: str) -> str:
"""Fix units."""
exceptions: List[Union[str, Pattern[str]]] = [
'comment',
'gallery',
'hyperlink',
'interwiki',
'link',
'nowiki',
'math',
'pre',
'startspace',
'syntaxhighlight',
]
        # change <number> ccm -> <number> cm³
        text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?ccm',
                                     r'\1&nbsp;cm³', exceptions,
                                     site=self.site)
        # Solve wrong Nº sign with °C or °F
        # additional exception requested on fr-wiki for this stuff
        pattern = re.compile('«.*?»')
        exceptions.append(pattern)
        text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?[º°]([CF])',
                                     r'\1&nbsp;°\2', exceptions,
                                     site=self.site)
        text = textlib.replaceExcept(text, 'º([CF])', '°' + r'\1',
                                     exceptions,
                                     site=self.site)
        return text
def fix_ISBN(self, text: str) -> str:
"""Hyphenate ISBN numbers."""
return _reformat_ISBNs(text, strict=self.ignore != CANCEL.MATCH)
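

# A minimal standalone usage sketch. Assumptions: this toolkit is normally
# driven by kuCosmeticsRun.py (which also skips saving on whitespace-only
# diffs); the page title below is illustrative and nothing is saved here.
if __name__ == '__main__':
    site = pywikibot.Site('ku', 'wikipedia')
    page = pywikibot.Page(site, 'Kurdistan')
    toolkit = CosmeticChangesToolkit(page, show_diff=True, ignore=CANCEL.PAGE)
    result = toolkit.change(page.text)
    if result is not False:
        new_text, summaries = result
        pywikibot.output('; '.join(summaries.values()))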