# Bikarhêner:Balyozxane/skrîpt/py/getlisteyacats.py
#
# Ji Wîkîpediya, ensîklopediya azad.
#!/usr/bin/python3
import pywikibot
import pymysql
import re
import os

def list_intersection(list1, list2):
    """Return the items of list1 that also occur in list2, keeping list1's order."""
    common = []
    for item in list1:
        if item in list2:
            common.append(item)
    return common

def check_eligibility_en(candidate):
    """Determine if the English category *candidate* is addable.

    Returns False when the category name matches a maintenance/project
    pattern, or when any of its parent categories on en.wikipedia is in
    the ineligible list; True otherwise.  Performs a network lookup via
    pywikibot to fetch the category's parents.
    """
    cat = pywikibot.Page(pywikibot.Site("en", "wikipedia"), "Category:%s" % candidate)
    cat_cats = get_existing_cats(cat)
    ineligible_parents = [
        "Hidden categories",
        "Tracking categories",
        "Turkish people by occupation",
        "Cities in Turkey",
        "Turkish Kurdistan",
        "Iraqi Kurdistan",
        "Syrian Kurdistan",
        "Iranian Kurdistan",
        "Stub categories"
    ]

    # Skip maintenance/project categories.  BUG FIX: the original kept the
    # patterns in mixed case ("Wikiproject", "User") while testing them
    # against candidate.lower(), so those patterns could never match.
    excluded_patterns = ["articles", "wikiproject", "user"]
    candidate_lower = candidate.lower()

    if any(pattern in candidate_lower for pattern in excluded_patterns):
        return False  # maintenance/project category: not addable
    if list_intersection(ineligible_parents, cat_cats):
        return False  # has an ineligible parent category
    return True

def get_existing_cats(page):
    """Return the titles (namespace prefix stripped) of the categories *page* is in."""
    return [category.title(with_ns=False) for category in page.categories()]

def getList(lang_code, query_name, file_name, query_params=None):
    """Run *query_name* against the <lang_code>wiki replica database and dump
    the first column of every row (decoded, underscores -> spaces, one per
    line) into a temp file.

    Parameters:
        lang_code    -- wiki language code, e.g. "ku" or "en"
        query_name   -- SQL text to execute
        file_name    -- basename for the temp output file
        query_params -- optional parameter tuple passed to cursor.execute()

    Returns the path of the temp file.  The file is always created, even
    when the query returned no rows.
    """
    # Database connection details.
    # Note: if you are using Toolforge, the username and password come from
    # the replica.my.cnf file, so no explicit credentials are needed here.
    db_hostname = lang_code + "wiki.analytics.db.svc.wikimedia.cloud"
    db_port = 3306
    db_name = lang_code + "wiki_p"
    db_connect_file = "~/replica.my.cnf"  # user/password under [client]

    connection = pymysql.connect(
        host=db_hostname,
        port=db_port,
        database=db_name,
        read_default_file=db_connect_file,
        charset='utf8'
    )
    # BUG FIX: the original never closed the cursor/connection when a query
    # raised, leaking the connection.  Close them unconditionally.
    try:
        with connection.cursor() as cursor:
            cursor.execute("USE " + lang_code + "wiki_p;")
            if query_params is not None:
                cursor.execute(query_name, query_params)
            else:
                cursor.execute(query_name)
            results = cursor.fetchall()
    finally:
        connection.close()

    # HOMEPATH is unset on Toolforge, so this falls back to the tool home.
    toolforge_home = os.getenv('HOMEPATH', '/data/project/balyozbot/')
    temp_output_file_path = os.path.join(toolforge_home, file_name + '_temp')

    # BUG FIX: always create the output file.  The original skipped the
    # write on an empty result set, which made callers that open() the
    # returned path fail with FileNotFoundError.
    with open(temp_output_file_path, 'w', encoding='utf-8') as output_file:
        for result in results:
            page_title = result[0]  # first (only expected) column of the row
            page_title = page_title.decode('utf-8')  # replicas return bytes
            page_title = page_title.replace('_', ' ')
            output_file.write(f"{page_title}\n")

    if not results:
        print("No results found from the query.")

    return temp_output_file_path
# query

# SQL run against the kuwiki replica: for every non-hidden ku.wiki category
# (namespace 14) touched and first created within the last day, return its
# English-language sitelink.  Categories transcluding the template
# "Kategoriya çavdêriyê" are excluded.
#
# NOTE(review): the first NOT EXISTS subquery joins tl_from to
# template_page.page_id — that is the *linking* page, not the template page,
# and templatelinks no longer carries a title column, so this subquery looks
# ineffective.  The second NOT EXISTS (via linktarget.lt_title, the modern
# schema) appears to be the one actually doing the template exclusion —
# confirm before removing either.
ku_query = """
SELECT
  langlinks.ll_title AS enwiki_sitelink
FROM
  categorylinks
JOIN
  page ON cl_from = page_id
LEFT JOIN
  page_props ON page.page_id = page_props.pp_page AND page_props.pp_propname = 'hiddencat'
LEFT JOIN (
  SELECT
    rev_page,
    MIN(rev_timestamp) AS first_revision_timestamp
  FROM
    revision
  GROUP BY
    rev_page
) AS first_revision ON page_id = first_revision.rev_page
LEFT JOIN
  pagelinks ON page_id = pagelinks.pl_from
INNER JOIN  -- Change LEFT JOIN to INNER JOIN
  langlinks ON cl_from = langlinks.ll_from AND langlinks.ll_lang = 'en'
WHERE
  page_namespace = 14
  AND page.page_touched >= NOW() - INTERVAL 1 DAY
  AND first_revision.first_revision_timestamp >= NOW() - INTERVAL 1 DAY
  AND page_props.pp_page IS NULL
  AND NOT EXISTS (
    SELECT
      1
    FROM
      templatelinks
    JOIN
      page AS template_page ON tl_from = template_page.page_id
    JOIN
      revision AS template_revision ON template_page.page_latest = template_revision.rev_id
    WHERE
      tl_from = cl_from
      AND template_page.page_namespace = 10
      AND template_page.page_title = 'Kategoriya_çavdêriyê'
  )
  AND NOT EXISTS (
    SELECT
      1
    FROM
      templatelinks
    JOIN
      linktarget ON lt_id = tl_target_id
    WHERE
      tl_from = cl_from
      AND linktarget.lt_namespace = 10
      AND linktarget.lt_title = 'Kategoriya_çavdêriyê'
  )
GROUP BY
  enwiki_sitelink
ORDER BY
  MAX(cl_timestamp) DESC;
"""
# Basenames for the temp files written by getList() for each wiki.
en_file_name = 'getlisteyacategorizeEnKategori'
ku_file_name = 'getlisteyacategorizeKuKategori'

# Dump the recent ku.wiki categories (as enwiki sitelinks) to a temp file.
ku_file_path = getList("ku", ku_query, ku_file_name)

# HOMEPATH is unset on Toolforge, so this falls back to the tool home.
toolforge_home = os.getenv('HOMEPATH', '/data/project/balyozbot/')

output_file_path = os.path.join(toolforge_home, 'getlisteyacategorize.txt')

# Truncate the aggregate output file before the loop below appends to it.
with open(output_file_path, 'w', encoding='utf-8'):
    pass
    
with open(ku_file_path, 'r', encoding='utf-8') as ku_file:
    ku_results = set(ku_file.read().splitlines())

# Loop-invariant query, hoisted out of the loop (the original rebuilt the
# identical string on every iteration).  %s placeholder: the English
# category title (underscored, no "Category:" prefix).
en_query = """
SELECT
    ll.ll_title AS ku_page_title
FROM
    page
INNER JOIN langlinks AS ll ON ll.ll_from = page.page_id AND ll.ll_lang = 'ku'
INNER JOIN categorylinks AS cl ON cl.cl_from = page.page_id
WHERE 
    page.page_namespace IN (0,14)
    AND cl.cl_to = %s
GROUP BY
    ku_page_title
ORDER BY ku_page_title;
"""

unique_results_set = set()

for enwiki_sitelink in ku_results:
    if not check_eligibility_en(enwiki_sitelink):
        continue

    # Normalize the title for the SQL parameter.
    enwiki_sitelink = enwiki_sitelink.replace(' ', '_')
    enwiki_sitelink = enwiki_sitelink.replace('Category:', '')

    en_query_result_path = getList("en", en_query, en_file_name, (enwiki_sitelink,))

    try:
        with open(en_query_result_path, 'r', encoding='utf-8') as en_query_result_file:
            en_query_results = set(en_query_result_file.read().splitlines())

        # Check and append only results not already written.
        for en_result in en_query_results:
            if en_result not in unique_results_set:
                unique_results_set.add(en_result)
                with open(output_file_path, 'a', encoding='utf-8') as output_file:
                    output_file.write(f"{en_result}\n")

    except Exception as e:
        print(f"Error processing enwiki_sitelink '{enwiki_sitelink}': {str(e)}")

    finally:
        # BUG FIX: the original os.remove() sat at module level after the
        # loop, so it deleted only the *last* temp file and raised NameError
        # when no category was eligible.  Remove each temp file per
        # iteration instead.
        if os.path.exists(en_query_result_path):
            os.remove(en_query_result_path)

os.remove(ku_file_path)

# Count the collected page titles.  BUG FIX: the original counted '\n'
# characters in the *stripped* content, so a file holding exactly one title
# reported zero lines and was wrongly treated as "no changes"; the
# "num_lines + 1" in the print then over-counted by one for the empty case.
with open(output_file_path, 'r', encoding='utf-8') as check_file:
    page_titles = [line for line in check_file.read().splitlines() if line.strip()]

num_lines = len(page_titles)

# If any pages were collected, report the count and run the categorizer.
if num_lines > 0:
    print(f"Working on {num_lines} pages")
    os.system(f"$HOME/pwbvenv/bin/pwb categorize -always -lang:ku -family:wikipedia -file:getlisteyacategorize.txt")
else:
    print("No changes detected.")