Here naverokê

Bikarhêner:Wikihez/skrîpt/py/getlisteyacategorize.py

Ji Wîkîpediya, ensîklopediya azad.
#!/usr/bin/python3
"""
python pwb.py updatewin -file:"getlisteyacategorize.py" -s:"+remove dupes"
"""
import os
import subprocess
import time

import pymysql
import pywikibot

# When True, the script body prints progress and the collected results.
VERBOSE = True


def list_intersection(list1, list2):
    """Return the items of ``list1`` that also occur in ``list2``.

    The result follows the order of ``list1`` and keeps its duplicates.
    """
    shared = []
    for item in list1:
        if item in list2:
            shared.append(item)
    return shared


def check_eligibility_en(candidate):
    """Determine if the en.wiki category ``candidate`` is addable.

    Args:
        candidate: Title of an en.wiki category, with or without the
            ``Category:`` namespace prefix.

    Returns:
        ``False`` when the category name contains one of the maintenance
        keywords (matched case-insensitively) or when the category sits
        directly in one of the ineligible parent categories; ``True``
        otherwise.
    """
    prefix = "Category:"
    # The script passes the langlink title, which already carries the
    # namespace prefix; drop it so the lookup below does not ask for
    # "Category:Category:...".
    if candidate.startswith(prefix):
        name = candidate[len(prefix):]
    else:
        name = candidate

    # Maintenance-style categories are rejected by name. The patterns must be
    # lowercase because they are compared against the lowercased name.
    excluded_patterns = ("articles", "wikiproject", "user")
    lowered_name = name.lower()
    if any(pattern in lowered_name for pattern in excluded_patterns):
        # Decided from the name alone, so no request to the wiki is needed.
        return False

    ineligible_parents = {
        "Hidden categories",
        "Tracking categories",
        "Turkish people by occupation",
        "Cities in Turkey",
        "Turkish Kurdistan",
        "Iraqi Kurdistan",
        "Syrian Kurdistan",
        "Iranian Kurdistan",
        "Stub categories",
    }

    cat = pywikibot.Page(pywikibot.Site("en", "wikipedia"), "Category:%s" % name)
    # Eligible only if none of the category's own parents is on the list.
    return ineligible_parents.isdisjoint(get_existing_cats(cat))


def get_existing_cats(page):
    """Return the titles, without namespace prefix, of the page's categories."""
    return [category.title(with_ns=False) for category in page.categories()]


def get_list(lang_code, query_name, query_params=None):
    """Run one SQL query against a wiki replica database and return all rows.

    Args:
        lang_code: Wiki language code (e.g. ``"ku"``). It is used to build
            the host name ``<lang_code>wiki.analytics.db.svc.wikimedia.cloud``
            and the database name ``<lang_code>wiki_p``.
        query_name: The SQL text to execute (despite its name, not a label).
        query_params: Optional parameters handed to ``cursor.execute`` for
            the query's placeholders; ``None`` runs the query as-is.

    Returns:
        The result of ``cursor.fetchall()`` for the query.
    """
    # Database connection details
    # Note: If you are using Toolforge, you may ignore the database username and password
    db_hostname = lang_code + "wiki.analytics.db.svc.wikimedia.cloud"
    db_port = 3306
    db_name = lang_code + "wiki_p"
    db_connect_file = "~/replica.my.cnf"  # path to the "my.cnf" file

    connection = pymysql.connect(
        host=db_hostname,
        port=db_port,
        database=db_name,
        # The "my.cnf" file supplies user and password from its [client] section.
        read_default_file=db_connect_file,
        charset='utf8',
    )
    # try/finally guarantees the cursor and connection are released even when
    # a query raises, instead of leaking them.
    try:
        cursor = connection.cursor()
        try:
            # Use the correct wiki database
            cursor.execute("USE " + db_name + ";")

            # Execute the query with parameters if provided
            if query_params is not None:
                cursor.execute(query_name, query_params)
            else:
                cursor.execute(query_name)

            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()


# Add delay function
def add_delay(seconds):
    """Announce the pause on stdout, then block for ``seconds`` seconds."""
    notice = f"Waiting for {seconds} seconds..."
    print(notice)
    time.sleep(seconds)


# Queries
#
# ku_query is run against the ku.wiki replica. It selects category pages
# (namespace 14) that appear as members in categorylinks and that
#   * were touched, and had their first revision, within the last day,
#   * carry no 'hiddencat' page property,
#   * do not use the template 'Kategoriya_çavdêriyê' (namespace 10), and
#   * have a langlink to 'en'.
# It returns a single column, the en.wiki langlink title (with namespace
# prefix), one row per title, ordered by the newest cl_timestamp first.
# Example of a ku.wiki category this is meant for:
# Kategorî:Avabûnên 1670î li Hindistanê
#
# The template filter goes through linktarget (tl_target_id -> lt_id), which
# is where templatelinks stores the target namespace and title.
ku_query = """
SELECT
  langlinks.ll_title AS enwiki_sitelink
FROM
  categorylinks
JOIN
  page ON cl_from = page_id
LEFT JOIN
  page_props ON page.page_id = page_props.pp_page AND page_props.pp_propname = 'hiddencat'
LEFT JOIN (
  SELECT
    rev_page,
    MIN(rev_timestamp) AS first_revision_timestamp
  FROM
    revision
  GROUP BY
    rev_page
) AS first_revision ON page_id = first_revision.rev_page
INNER JOIN
  langlinks ON cl_from = langlinks.ll_from AND langlinks.ll_lang = 'en'
WHERE
  page_namespace = 14
  AND page.page_touched >= NOW() - INTERVAL 1 DAY
  AND first_revision.first_revision_timestamp >= NOW() - INTERVAL 1 DAY
  AND page_props.pp_page IS NULL
  AND NOT EXISTS (
    SELECT
      1
    FROM
      templatelinks
    JOIN
      linktarget ON lt_id = tl_target_id
    WHERE
      tl_from = cl_from
      AND linktarget.lt_namespace = 10
      AND linktarget.lt_title = 'Kategoriya_çavdêriyê'
  )
GROUP BY
  enwiki_sitelink
ORDER BY
  MAX(cl_timestamp) DESC;
"""

"""
en_query returns ku.wiki sitelinks for pages that belong to a given en.wiki category.
The category is matched on linktarget.lt_title (reached through cl_target_id), so the
parameter is the category name without the 'Category:' prefix, with spaces as underscores.
Example result: Kategorî:Avabûnên 1670î li Hindistanê
"""
# Parameterised query: the single %s placeholder is filled in by the database
# driver from the query parameters (the script passes the category title).
# Only members in namespaces 0 and 14 that have a 'ku' langlink are
# considered; each ku.wiki title is returned once, sorted by title.
# Every table is qualified with enwiki_p.
en_query = """
SELECT
    ll.ll_title AS ku_page_title
FROM
    enwiki_p.page
INNER JOIN enwiki_p.langlinks AS ll
    ON ll.ll_from = page.page_id
    AND ll.ll_lang = 'ku'
INNER JOIN enwiki_p.categorylinks AS cl
    ON cl.cl_from = page.page_id
INNER JOIN enwiki_p.linktarget AS lt
    ON lt.lt_id = cl.cl_target_id
WHERE
    page.page_namespace IN (0, 14)
    AND lt.lt_namespace = 14
    AND lt.lt_title = %s
GROUP BY
    ll.ll_title
ORDER BY
    ll.ll_title;
"""

# Script body.
# Step A: collect, for every recently created ku.wiki category that has an
# eligible en.wiki counterpart, the ku.wiki titles of the members of that
# en.wiki category.
ku_results = get_list("ku", ku_query)

category_prefix = 'Category:'
output_results = []

for row in ku_results:
    enwiki_sitelink = row[0].decode('utf-8')

    if VERBOSE:
        print(f"Result: {enwiki_sitelink}\nWaiting for two seconds")

    add_delay(2)  # Add a delay of 2 seconds before each query

    if not check_eligibility_en(enwiki_sitelink):
        continue

    # Build the value matched against lt_title in en_query: drop only the
    # leading 'Category:' prefix and use underscores instead of spaces.
    if enwiki_sitelink.startswith(category_prefix):
        cat_name = enwiki_sitelink[len(category_prefix):]
    else:
        cat_name = enwiki_sitelink
    cat_name = cat_name.replace(' ', '_')

    if VERBOSE:
        print(f"Getting list of pages in {cat_name}")

    en_query_results = get_list("en", en_query, (cat_name,))
    # Rows are decoded once here; output_results holds plain strings.
    output_results.extend(
        result[0].decode('utf-8') for result in en_query_results
    )

if VERBOSE:
    for result in output_results:
        print(f"output_results: {result}")
    print(f"Final output_results: {output_results}")

# NOTE(review): HOMEPATH is normally a Windows variable; presumably the
# hard-coded fallback is what applies on Toolforge. Confirm whether HOME was
# intended.
toolforge_home = os.getenv('HOMEPATH', '/data/project/balyozbot/')
listeyacategorize = os.path.join(toolforge_home, 'getlisteyacategorize.txt')

# 1. Remove duplicates while keeping first-seen order
unique_results = list(dict.fromkeys(output_results))

# 2. Overwrite the file with unique results, one title per line
with open(listeyacategorize, 'w', encoding='utf-8') as file:
    file.writelines(f"{page_name}\n" for page_name in unique_results)

# 3. Check if we have pages to process
num_pages = len(unique_results)

if num_pages > 0:
    print(f"Working on {num_pages} pages")
    # Pass the same absolute path the list was written to, so the bot reads
    # this file regardless of the current working directory. An argument
    # list (no shell) avoids any quoting issues with that path.
    pwb_executable = os.path.join(
        os.path.expanduser("~"), "pwbvenv", "bin", "pwb"
    )
    subprocess.run(
        [
            pwb_executable,
            "categorizewithcreator",
            "-always",
            "-lang:ku",
            "-family:wikipedia",
            f"-file:{listeyacategorize}",
        ],
        check=False,
    )
else:
    print("No changes detected.")