Bikarhêner:Balyozxane/skrîpt/py/cavkanileke.py
Xuyakirin
#!/usr/bin/python3
"""
Fork of [[:mw:Manual:Pywikibot/noreferences.py]]
This script adds a missing references section to pages.
It goes over multiple pages, searches for pages where <references />
is missing although a <ref> tag is present, and in that case adds a new
references section.
These command line parameters can be used to specify which pages to work on:
¶ms;
Furthermore, the following command line parameters are supported:
-xml Retrieve information from a local XML dump (pages-articles
or pages-meta-current, see https://dumps.wikimedia.org).
Argument can also be given as "-xml:filename".
-always Don't prompt you for each replacement.
-quiet Use this option to get less output
If neither a page title nor a page generator is given, it takes all pages from
the default maintenance category.
It is strongly recommended not to run this script over the entire article
namespace (using the -start parameter), as that would consume too much
bandwidth. Instead, use the -xml parameter, or use another way to generate
a list of affected articles.
"""
#
# (C) Pywikibot team, 2007-2022
#
# Distributed under the terms of the MIT license.
#
import re
from functools import partial
import pywikibot
from pywikibot import i18n, pagegenerators, textlib
from pywikibot.bot import ExistingPageBot, SingleSiteBot
from pywikibot.exceptions import LockedPageError
from pywikibot.pagegenerators import XMLDumpPageGenerator
from kucosmetics import CANCEL, CosmeticChangesToolkit
import mwparserfromhell
import mytools
def reorder_templates_and_remove_categories(text):
    """Strip bottom-of-page elements from ``text`` and return them separately.

    The extracted pieces are meant to be re-appended in a fixed order by
    ``append_templates_and_categories``.

    :param text: page wikitext
    :type text: str
    :return: a 7-tuple ``(text, koord_templates, templates, categories,
        defaultsort, template_sitil, kontroloto)`` where ``text`` is the
        wikitext with the extracted elements removed, ``categories`` holds
        the complete ``[[Kategorî:...]]`` links, ``defaultsort`` is the inner
        text of the first DEFAULTSORT-like template (``''`` if none) and the
        remaining items are lists of template inner texts (without braces)
    """
    # Title-display Koord/Coord templates
    koord_templates = find_koord_templates(text)
    text = remove_koord_templates(text)

    # Stub templates (parameterless)
    template_regex = r'{{\s*([^\}]+\-şitil|[Şş]iti?l|[Kk]urt|[Ss]tub|[Şş]itlek|[^\}]+\-şitil\-[^\}]+)\s*}}'
    templates = re.findall(template_regex, text)
    text = re.sub(template_regex, '', text)

    # Authority control templates (parameterless)
    kontrol_regex = r'{{\s*([Kk]ontrola otorîteyê|[aA]uthority control|[Kk]ontrola otorîte)\s*}}'
    kontroloto = re.findall(kontrol_regex, text)
    text = re.sub(kontrol_regex, '', text)

    # "Şitil-..." prefixed stub templates
    template_sitil_regex = r'{{\s*([Şş]itil-[^\}]+)\s*}}'
    template_sitil = re.findall(template_sitil_regex, text)
    text = re.sub(template_sitil_regex, '', text)

    # Category links
    category_regex = r'\[\[Kategorî:[^\]]+\]\]'
    categories = re.findall(category_regex, text)
    text = re.sub(category_regex, '', text)

    # DEFAULTSORT (or the template that stands in for it)
    defaultsort_regex = r'{{\s*(DEFAULTSORT:[^}]+|Salê kat bike heke sal hebe)\s*}}'
    defaultsort_matches = re.findall(defaultsort_regex, text)
    defaultsort = defaultsort_matches[0] if defaultsort_matches else ""
    # Only the first match is handed back for re-insertion, so only that one
    # may be removed here; deleting every match would silently drop any
    # further template from the page.
    text = re.sub(defaultsort_regex, '', text, count=1)

    return text, koord_templates, templates, categories, defaultsort, template_sitil, kontroloto
def find_koord_templates(text):
    """Collect the parameters of Koord/Coord templates ending in display=title.

    :param text: page wikitext
    :return: list with one string per matching template, holding everything
        between the first ``|`` and the closing braces
    """
    pattern = re.compile(
        r'{{\s*([Kk]oord|[Cc]oord)\s*\|\s*([^}]+display\s*=\s*title)\s*}}')
    # Group 1 is the template name, group 2 the parameter string we want.
    return [found.group(2) for found in pattern.finditer(text)]
def remove_koord_templates(text):
    """Return ``text`` without Koord/Coord templates ending in display=title.

    :param text: page wikitext
    :return: the wikitext with every matching template deleted; surrounding
        whitespace is left untouched
    """
    pattern = re.compile(
        r'{{\s*([Kk]oord|[Cc]oord)\s*\|\s*[^}]+display\s*=\s*title\s*}}')
    return pattern.sub('', text)
def append_templates_and_categories(text, koord_templates, templates, categories, defaultsort, template_sitil,
                                    kontroloto):
    """Append the extracted bottom-of-page elements to ``text`` in fixed order.

    The order is: authority control templates, "Şitil-..." templates, other
    stub templates, Koord templates, DEFAULTSORT and finally the categories
    (separated from the rest by an empty line).

    :param text: page wikitext without the extracted elements
    :param koord_templates: parameter strings; each is emitted as
        ``{{Koord|<params>}}``
    :param templates: template inner texts, emitted as ``{{<name>}}``
    :param categories: complete category links, emitted unchanged
    :param defaultsort: inner text of the DEFAULTSORT template or ``''``
    :param template_sitil: template inner texts, emitted as ``{{<name>}}``
    :param kontroloto: template inner texts, emitted as ``{{<name>}}``
    :return: the recombined wikitext
    """
    updated_text = text.rstrip('\n')

    # (opening markup, inner texts) in output order
    template_groups = (
        ('{{', kontroloto),
        ('{{', template_sitil),
        ('{{', templates),
        ('{{Koord|', koord_templates),
    )
    for opening, inner_texts in template_groups:
        if inner_texts:
            updated_text += '\n' + '\n'.join(
                opening + inner + '}}' for inner in inner_texts)

    if defaultsort:
        updated_text += '\n{{' + defaultsort + '}}'

    if categories:
        updated_text += '\n\n' + '\n'.join(categories)

    # Collapse every run of two or more newlines to exactly one empty line.
    # This applies to the whole text, not only to its end.
    return re.sub(r'\n{2,}', '\n\n', updated_text)
# This is required for the text that is shown when you run this script
# with the parameter -help: each key found in the module docstring is
# replaced by the mapped help text.
# NOTE(review): the key below looks like a mis-decoded '&params;' (the
# HTML entity '&para' turned into the pilcrow sign). The same garbled
# placeholder appears in the module docstring, so both must be changed
# together - confirm against the upstream script before fixing.
docuReplacements = {
'¶ms;': pagegenerators.parameterHelp,
}
# References sections are usually placed before further reading / external
# link sections. This dictionary defines these sections, sorted by priority.
# For example, on an English wiki, the script would place the "References"
# section in front of the "Further reading" section, if that existed.
# Otherwise, it would try to put it in front of the "External links" section,
# or if that fails, the "See also" section, etc.
# The titles are interpolated into a regular expression by
# NoReferencesBot.addReferences.
placeBeforeSections = {
'ku': [ # no explicit policy on where to put the references
'Bîbliyografî',
'Girêdanên derve',
]
}
# Titles of sections where a reference tag would fit into.
# The first title should be the preferred one: It's the one that will be
# used when a new section has to be created. Section titles can be regex
# patterns, except for the first one.
referencesSections = {
'wikipedia': {
'ku': [
'Çavkanî',
]
},
}
# Templates which include a <references /> tag. If there is no such template
# on your wiki, you don't have to enter anything here.
# The names are joined into a case-insensitive regular expression by
# NoReferencesBot.lacksReferences.
referencesTemplates = {
'wikipedia': {
'ku': ['Çavkanî', 'Reflist']
},
}
# Text to be added instead of the <references /> tag.
# Define this only if required by your wiki.
referencesSubstitute = {
'wikipedia': {
'ku': '{{Çavkanî}}'
},
}
# Sites where no title is required for references template
# as it is already included there
noTitleRequired = ['be', 'szl']
# Repository item ID of the maintenance category; main() resolves it with
# site.page_from_repository() when no page generator was given.
maintenance_category = 'Q6483427'
# Matches a closing </ref> tag (case-insensitive).
_ref_regex = re.compile('</ref>', re.IGNORECASE)
# Matches a self-closing <references ... /> tag (case-insensitive).
_references_regex = re.compile('<references.*?/>', re.IGNORECASE)
def _match_xml_page_text(text) -> bool:
    """Tell whether a dump page has ref tags but no references tag.

    Disabled parts of the text are removed with
    ``textlib.removeDisabledParts`` before matching.

    :param text: page wikitext from the XML dump
    :return: True if a ``</ref>`` tag is present and no self-closing
        ``<references />`` tag is found, False otherwise
    """
    text = textlib.removeDisabledParts(text)
    # bool() keeps the annotated return type: a failed search yields None,
    # which the bare ``and`` expression would otherwise return unchanged.
    return bool(_ref_regex.search(text)) and not _references_regex.search(text)
# XML dump page generator pre-configured with _match_xml_page_text as its
# text predicate; main() calls it with the dump filename for the -xml option.
XmlDumpNoReferencesPageGenerator = partial(
XMLDumpPageGenerator, text_predicate=_match_xml_page_text)
class NoReferencesBot(SingleSiteBot, ExistingPageBot):

    """References section bot.

    Besides adding a missing references section, this bot replaces a
    ``<references />`` tag by the ``{{Çavkanî}}`` template, substitutes
    ``{{Çk}}``, moves bottom-of-page templates and categories below a newly
    created section and applies cosmetic changes before saving.
    """

    use_disambigs = False
    use_redirects = False

    def __init__(self, **kwargs) -> None:
        """Initializer.

        Registers the ``verbose`` and ``ignore`` options and loads the
        site-specific references templates and substitute text.
        """
        self.available_options.update({
            'verbose': True,
            'ignore': CANCEL.MATCH,
        })
        super().__init__(**kwargs)
        # Prefix of every edit summary
        self.botname = '[[Bikarhêner:Balyozxane/skrîpt/py/cavkanileke.py|Bot]]'
        self.refR = _ref_regex
        self.referencesR = _references_regex
        self.referencesTagR = re.compile('<references>.*?</references>',
                                         re.IGNORECASE | re.DOTALL)
        try:
            self.referencesTemplates = referencesTemplates[
                self.site.family.name][self.site.code]
        except KeyError:
            self.referencesTemplates = []
        try:
            self.referencesText = referencesSubstitute[
                self.site.family.name][self.site.code]
        except KeyError:
            self.referencesText = '<references />'

    def lacksReferences(self, text) -> bool:
        """Check whether the page needs to be handled by addReferences.

        True is returned when the text, with disabled parts removed,

        * contains a plain ``<references />`` tag (it will be replaced by
          the template), or
        * contains no references tag or template but one of the templates
          ``1928``, ``1946``, ``1968`` or ``Çk``, or
        * contains a ``</ref>`` tag without any references tag or template.

        :param text: page wikitext
        :return: True if the page should be changed, False otherwise
        """
        oldTextCleaned = textlib.removeDisabledParts(text)

        if re.search(r"<references\s*?/>", oldTextCleaned):
            return True
        elif self.referencesR.search(oldTextCleaned) \
                or self.referencesTagR.search(oldTextCleaned):
            if self.opt.verbose:
                pywikibot.info('No changes necessary: references tag found.')
            return False

        if self.referencesTemplates:
            templateR = '{{(' + '|'.join(self.referencesTemplates) + ')'
            if re.search(templateR, oldTextCleaned, re.IGNORECASE):
                if self.opt.verbose:
                    pywikibot.info(
                        'No changes necessary: references template found.')
                return False

        parsed = mwparserfromhell.parse(oldTextCleaned)
        templates_to_find = ["1928", "1946", "1968", 'Çk']
        # filter_templates() gives a list, so an empty result is falsy; the
        # default argument binds the current name to the lambda.
        if any(parsed.filter_templates(
                matches=lambda tpl, name=name: tpl.name.matches(name))
               for name in templates_to_find):
            return True

        if not self.refR.search(oldTextCleaned):
            if self.opt.verbose:
                pywikibot.info('No changes necessary: no ref tags found.')
            return False

        if self.opt.verbose:
            pywikibot.info('Found ref without references.')
        return True

    def addReferences(self, oldText) -> str:
        """
        Add a references tag into an existing section where it fits into.

        If there is no such section, creates a new section containing
        the references tag. Also repair malformed references tags.
        Before that, a plain ``<references />`` tag is replaced by the
        ``{{Çavkanî}}`` template and ``{{Çk}}`` is turned into
        ``{{subst:Çk}}``; either case returns right away.
        Set the edit summary (``self.comment``) accordingly.

        :param oldText: page text to be modified
        :type oldText: str
        :return: The modified pagetext
        """
        if re.search(r"<references\s*?/>", oldText):
            self.comment = f'{self.botname}: Etîketa <references /> bi şablona {{{{[[Şablon:Çavkanî|Çavkanî]]}}}} hat guhartin.'
            newText = re.sub(r"<\s*?references\s*?/>", "{{Çavkanî}}", oldText)
            return newText

        parsed = mwparserfromhell.parse(oldText)
        substCk = False
        for template in parsed.filter_templates():
            template_name = mytools.ucfirst(template.name)
            if template_name == 'Çk':
                substCk = True
                template.name = 'subst:Çk'

        if substCk:
            cktemplate = '{{[[Şablon:Çk|Çk]]}}'
            cksubsttemp = '{{[[Şablon:Çk|subst:Çk]]}}'
            self.comment = f'{self.botname}: {cktemplate} kir {cksubsttemp}'
            return str(parsed)

        # Do we have a malformed <reference> tag which could be repaired?
        # Set the edit summary for this case
        self.comment = f'{self.botname}: Etîketa <references /> hat sererastkirin.'

        # Repair two opening tags or an opening and an empty tag
        pattern = re.compile(r'< *references *>(.*?)'
                             r'< */?\s*references */? *>', re.DOTALL)
        if pattern.search(oldText):
            pywikibot.info('Repairing references tag')
            return re.sub(pattern, r'<references>\1</references>', oldText)

        # Repair single unclosed references tag
        pattern = re.compile(r'< *references *>')
        if pattern.search(oldText):
            pywikibot.info('Repairing references tag')
            return re.sub(pattern, '<references />', oldText)

        # Is there an existing section where we can add the references tag?
        # Set the edit summary for this case. The four closing braces render
        # as "}}" so that the braces in the summary are balanced.
        self.comment = f'{self.botname}: Şablona {{{{[[Şablon:çavkanî|Çavkanî]]}}}} lê hat zêdekirin.'
        for section in i18n.translate(self.site, referencesSections):
            sectionR = re.compile(fr'\r?\n=+ *{section} *=+ *\r?\n')
            index = 0
            while index < len(oldText):
                match = sectionR.search(oldText, index)
                if match:
                    if textlib.isDisabled(oldText, match.start()):
                        pywikibot.info(f'Existing {section} section is '
                                       f'commented out, skipping.')
                        index = match.end()
                    else:
                        pywikibot.info(f'Adding references tag to existing'
                                       f' {section} section...\n')
                        # Templates and comments directly below the heading
                        # are kept after the inserted references text.
                        templates_or_comments = re.compile(
                            r'^((?:\s*(?:\{\{[^\{\}]*?\}\}|<!--.*?-->))*)',
                            flags=re.DOTALL)
                        new_text = (
                            oldText[:match.end() - 1]
                            + templates_or_comments.sub(
                                fr'\n{self.referencesText}\1',
                                oldText[match.end() - 1:]))
                        return new_text
                else:
                    break

        # Create a new section for the references tag
        for section in i18n.translate(self.site, placeBeforeSections):
            # Find out where to place the new section
            sectionR = re.compile(r'\r?\n(?P<ident>=+) *{} *(?P=ident) *\r?\n'
                                  .format(section))
            index = 0
            while index < len(oldText):
                match = sectionR.search(oldText, index)
                if match:
                    if textlib.isDisabled(oldText, match.start()):
                        pywikibot.info(
                            'Existing {} section is commented out, '
                            "won't add the references in front of it."
                            .format(section))
                        index = match.end()
                    else:
                        pywikibot.info(
                            'Adding references section before {} section...\n'
                            .format(section))
                        index = match.start()
                        ident = match['ident']
                        return self.createReferenceSection(oldText, index,
                                                           ident)
                else:
                    break

        # This gets complicated: we want to place the new references
        # section over the interwiki links and categories, but also
        # over all navigation bars, persondata, and other templates
        # that are at the bottom of the page. So we need some advanced
        # regex magic.
        # The strategy is: create a temporary copy of the text. From that,
        # keep removing interwiki links, templates etc. from the bottom.
        # At the end, look at the length of the temp text. That's the position
        # where we'll insert the references section.
        catNamespaces = '|'.join(self.site.namespaces.CATEGORY)
        categoryPattern = fr'\[\[\s*({catNamespaces})\s*:[^\n]*\]\]\s*'
        interwikiPattern = r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]\s*'
        # won't work with nested templates
        # the negative lookahead assures that we'll match the last template
        # occurrence in the temp text.
        # FIXME:
        # {{commons}} or {{commonscat}} are part of Weblinks section
        # * {{template}} is mostly part of a section
        # so templatePattern must be fixed
        templatePattern = r'\r?\n{{((?!}}).)+?}}\s*'
        commentPattern = r'<!--((?!-->).)*?-->\s*'
        metadataR = re.compile(r'(\r?\n)?({}|{}|{}|{})$'
                               .format(categoryPattern, interwikiPattern,
                                       templatePattern, commentPattern),
                               re.DOTALL)
        tmpText = oldText
        while True:
            match = metadataR.search(tmpText)
            if match:
                tmpText = tmpText[:match.start()]
            else:
                break
        pywikibot.info(
            'Found no section that can be preceded by a new references '
            'section.\nPlacing it before interwiki links, categories, and '
            'bottom templates.')
        index = len(tmpText)
        return self.createReferenceSection(oldText, index)

    def createReferenceSection(self, oldText, index, ident: str = '==') -> str:
        """Create a reference section and insert it into the given text.

        After the insertion, two directly consecutive "Çavkanî" headings are
        merged into one and the bottom-of-page templates, DEFAULTSORT and
        categories are moved to the end of the text.

        :param oldText: page text that is going to be be amended
        :type oldText: str
        :param index: the index of oldText where the reference section should
            be inserted at
        :type index: int
        :param ident: symbols to be inserted before and after reference section
            title
        :return: the amended page text with reference section added
        """
        if self.site.code in noTitleRequired:
            ref_section = f'\n\n{self.referencesText}\n'
        else:
            ref_section = '\n\n{ident} {title} {ident}\n{text}\n'.format(
                title=i18n.translate(self.site, referencesSections)[0],
                ident=ident, text=self.referencesText)
        newText = oldText[:index].rstrip() + ref_section + oldText[index:]

        # Substituting multiple occurrences of "Çavkanî" with a single occurrence in the entire newText
        newText = re.sub(r'==\s*Çavkanî\s*==\s*==\s*Çavkanî\s*==', '== Çavkanî ==', newText)

        text, koord_templates, templates, categories, defaultsort, template_sitil, kontroloto = reorder_templates_and_remove_categories(
            newText)
        newText = append_templates_and_categories(text, koord_templates, templates, categories, defaultsort,
                                                  template_sitil, kontroloto)
        return newText

    def skip_page(self, page):
        """Check whether the page could be processed.

        In addition to the checks of the parent classes, pages in one of the
        listed "Gotara bi ..." categories are skipped.

        :param page: the page to check
        :return: True if the page must be skipped, False otherwise
        """
        if super().skip_page(page):
            return True

        # Check if the page is in one of the specified categories
        categories_to_skip = [
            'Kategorî:Gotara bi kirmaşanî',
            'Kategorî:Gotara bi kurdiya başûr',
            'Kategorî:Gotara bi soranî',
            'Kategorî:Gotara bi zazakî'
        ]
        page_categories = [cat.title() for cat in page.categories()]
        if any(category in page_categories for category in categories_to_skip):
            pywikibot.warning(
                'Page {} is in a specified category. Skipping.'
                .format(page.title(as_link=True)))
            return True

        if self.site.sitename == 'wikipedia:en' and page.isIpEdit():
            pywikibot.warning(
                'Page {} is edited by IP. Possible vandalized'
                .format(page.title(as_link=True)))
            return True
        return False

    def do_kozmetik(self, old_text):
        """Apply cosmetic changes to ``old_text``.

        :param old_text: page text to clean up
        :return: tuple ``(text, summary_suffix)``; ``summary_suffix`` is
            ``''`` when nothing was changed, otherwise the text to append to
            the edit summary
        """
        cc_toolkit = CosmeticChangesToolkit(self.current_page,
                                            ignore=self.opt.ignore)
        new_text, summaries = cc_toolkit.change(old_text)

        # The toolkit result may be False instead of a text (the original
        # code tested for it); in that case the unchanged input has to be
        # returned, because the caller saves whatever text it gets back.
        if new_text is False or new_text == old_text:
            return old_text, ""

        kozmetik_cebu = "; paqijiyên kozmetîk"
        applied_summaries = ', '.join(summaries.values())
        if applied_summaries:
            kozmetik_cebu += f' ({applied_summaries}.)'
        return new_text, kozmetik_cebu

    def treat_page(self) -> None:
        """Run the bot.

        If the current page needs a change, add the references, apply
        cosmetic changes and save the result with the combined summary.
        """
        page = self.current_page
        try:
            text = page.text
        except LockedPageError:
            pywikibot.warning('Page {} is locked?!'
                              .format(page.title(as_link=True)))
            return

        if self.lacksReferences(text):
            new_text = self.addReferences(text)
            cleaned_new_text, kozmetik_cebu = self.do_kozmetik(new_text)
            self.put_current(cleaned_new_text, summary=f"{self.comment}{kozmetik_cebu}")
def main(*args: str) -> None:
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Recognized local options are ``-xml``, ``-always``, ``-quiet`` and
    ``-ignore``; every other argument is handed to the generator factory.
    Without any generator, the articles of the maintenance category are
    used.

    :param args: command line arguments
    :raises ValueError: if the value given to ``-ignore`` is not an
        attribute of ``CANCEL``
    """
    bot_options = {}
    generator = None

    # Process global args and prepare generator args parser
    remaining_args = pywikibot.handle_args(args)
    factory = pagegenerators.GeneratorFactory()

    for argument in remaining_args:
        name, _, param = argument.partition(':')
        if name == '-xml':
            dump_path = param or i18n.input('pywikibot-enter-xml-filename')
            generator = XmlDumpNoReferencesPageGenerator(dump_path)
        elif name == '-always':
            bot_options['always'] = True
        elif name == '-quiet':
            bot_options['verbose'] = False
        elif name == '-ignore':
            mode = param.upper()
            try:
                bot_options['ignore'] = getattr(CANCEL, mode)
            except AttributeError:
                raise ValueError(f'Unknown ignore mode {mode!r}!')
        else:
            factory.handle_arg(argument)

    generator = factory.getCombinedGenerator(generator, preload=True)
    if not generator:
        # Fall back to the maintenance category of the default site
        site = pywikibot.Site()
        fallback_category = site.page_from_repository(maintenance_category)
        if fallback_category:
            generator = fallback_category.articles(
                namespaces=factory.namespaces or [0])

    if not generator:
        pywikibot.bot.suggest_help(missing_generator=True)
        return

    bot = NoReferencesBot(generator=generator, **bot_options)
    bot.run()


if __name__ == '__main__':
    main()