# Bikarhêner:Balyozxane/skrîpt/py/jsonGbingehin.py
#
# Ji Wîkîpediya, ensîklopediya azad. (From Wikipedia, the free encyclopedia.)
import pywikibot
import mwparserfromhell
import json
import re


def extract_section_info(page_title):
    """
    Collect the articles listed with ``{{lgb}}`` templates on a wiki page.

    The page is fetched from the Kurdish Wikipedia ('ku'), parsed, and every
    headed section is scanned for ``lgb`` templates. Templates that carry an
    ``en`` parameter are skipped, as are templates without a usable first
    positional parameter and targets in the project or category namespaces.

    Args:
        page_title (str): The title of the Wikipedia page. A ``Level/<n>``
            component sets the level; without one the level defaults to 3.

    Returns:
        dict: Maps each article title to a dict with the keys ``"level"``
        (int), ``"topic"`` (only for level-4 pages: the page title with the
        level-4 prefix removed) and ``"section"`` (the heading of the section
        the template was found in; repeated headings get a numeric suffix).
    """
    print(f"Extracting section info for page: {page_title}")

    site = pywikibot.Site('ku', 'wikipedia')
    page = pywikibot.Page(site, page_title)
    wikicode = mwparserfromhell.parse(page.text)

    # Pages without a "Level/<n>" part in their title are treated as level 3.
    match = re.search(r'Level/(\d+)', page_title)
    level = int(match.group(1)) if match else 3

    # Always bind ``topic`` so it can never be referenced while unset.
    topic = None
    if level == 4:
        topic = re.sub(r'Wîkîpediya:Gotarên bingehîn/Level/4/', '', page_title)

    wikilinks_info = {}
    section_titles_count = {}

    # NOTE(review): with the default (non-flat) get_sections(), a section
    # appears to include its subsections, so a template is visited once per
    # enclosing heading and the last (deepest) heading wins -- confirm.
    for section in wikicode.get_sections(levels=[1, 2, 3, 4, 5, 6]):
        section_headings = section.filter_headings()
        if not section_headings:
            print("No section headings found.")
            continue

        section_title = str(section_headings[0].title.strip())

        # Disambiguate repeated headings: "X", "X 2", "X 3", ...
        occurrence = section_titles_count.get(section_title, 0) + 1
        section_titles_count[section_title] = occurrence
        if occurrence > 1:
            new_section_title = f"{section_title} {occurrence}"
        else:
            new_section_title = section_title

        for template in section.filter_templates():
            if template.name.strip_code().strip() != "lgb":
                continue

            # Skip templates with "en" parameter
            if any(param.name.strip_code().strip() == "en" for param in template.params):
                continue

            # template.get(1) raises ValueError when the parameter is absent;
            # a bare {{lgb}} must not abort the whole run.
            if not template.has(1):
                continue

            # Trim surrounding whitespace so the key is the real page title,
            # and drop empty targets (they cannot be filed under a letter).
            wikilink_title = template.get(1).value.strip_code().strip()
            if not wikilink_title:
                continue

            if "Wîkîpediya:" in wikilink_title or "Wikipedia:" in wikilink_title or "Kategorî:" in wikilink_title:
                continue

            wikilink_info = {"level": level}
            if level == 4:
                wikilink_info["topic"] = topic
            wikilink_info["section"] = str(new_section_title)

            wikilinks_info[wikilink_title] = wikilink_info

    return wikilinks_info


def save_page_content(site, page_title, content):
    """
    Replace the text of a wiki page with ``content`` and save it.

    Args:
        site: The pywikibot site the page belongs to.
        page_title (str): Title of the page to write.
        content (str): New text for the page.
    """
    edit_summary = "[[Bikarhêner:Balyozxane/skrîpt/py/jsonGbingehin.py|Bot]]: Naveroka rûpelê hat rojanekirin"

    target = pywikibot.Page(site, page_title)
    target.text = content
    target.save(summary=edit_summary)


def process_wikilinks_info(wikilinks_info, root, site):
    """
    Group the collected entries by first letter and save one JSON page per group.

    Each title is filed under its upper-cased first character when that
    character is a letter of the Kurdish alphabet, otherwise (including an
    empty title) under "yên din". Every group is serialised as JSON and
    written to ``<root>/<group>.json`` via ``save_page_content``.

    Args:
        wikilinks_info (dict): Maps article titles to their info dicts.
        root (str): Title prefix of the pages the JSON is saved to.
        site: The pywikibot site passed through to ``save_page_content``.
    """
    # A set gives exact single-letter membership; testing against the plain
    # string would be a substring test (and "" is a substring of everything).
    kurdish_alphabet = frozenset("ABCÇDEÊFGHIÎJKLMNOPQRSŞTUÛVWXYZ")

    output = {}
    for wikilink_title, info in wikilinks_info.items():
        # Slicing instead of indexing avoids an IndexError on an empty title.
        first_letter = wikilink_title[:1].upper()
        if first_letter not in kurdish_alphabet:
            first_letter = "yên din"  # Bucket for everything outside the Kurdish alphabet

        output.setdefault(first_letter, {})[wikilink_title] = info

    for letter, items in output.items():
        page_title = f"{root}/{letter}.json"
        content = json.dumps(items, ensure_ascii=False, indent=4)
        save_page_content(site, page_title, content)


if __name__ == "__main__":
    # Source pages, in the order they are processed. The page without a
    # "Level/<n>" part is handled as level 3 by extract_section_info.
    pages = [
        "Wîkîpediya:Gotarên bingehîn/Level/1",
        "Wîkîpediya:Gotarên bingehîn/Level/2",
        "Wîkîpediya:Gotarên bingehîn",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Biyolojî û zanistên sihetê",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Civak û zanistên civakî",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Cografya",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Felsefe û dîn",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Huner",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Jiyana rojane",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Matematîk",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Mirov",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Tarîx",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Teknolojî",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Zanistên fizîkî"
    ]
    wikilinks_info_all = {}

    for page_title in pages:
        wikilinks_info = extract_section_info(page_title)
        # The first page that lists an article wins; later pages never
        # overwrite an entry that is already recorded.
        for wikilink_title, info in wikilinks_info.items():
            wikilinks_info_all.setdefault(wikilink_title, info)

    root = "Wîkîpediya:Gotarên bingehîn/dane"
    site = pywikibot.Site('ku', 'wikipedia')
    process_wikilinks_info(wikilinks_info_all, root, site)