Bikarhêner:Wikihez/skrîpt/py/getlisteyacategorize.py
Xuyakirin
(Ji Bikarhêner:Wikihez/skrîpt/py/getlisteyacats.py hat beralîkirin)
#!/usr/bin/python3
"""
python pwb.py updatewin -file:"getlisteyacategorize.py" -s:"+remove dupes"
"""
import os
import subprocess
import time

import pymysql
import pywikibot
VERBOSE = True
def list_intersection(list1, list2):
    """Return the items of ``list1`` that also occur in ``list2``.

    The order of ``list1`` is preserved, and an item that appears several
    times in ``list1`` appears as many times in the result.

    Args:
        list1: items to filter.
        list2: items to test membership against.

    Returns:
        A new list with the matching items of ``list1``.
    """
    try:
        # Build the lookup once so each membership test is O(1) instead of
        # scanning ``list2`` for every element of ``list1``.
        lookup = set(list2)
        return [value for value in list1 if value in lookup]
    except TypeError:
        # Unhashable items (e.g. nested lists) cannot live in a set; fall
        # back to the plain linear scan for them.
        return [value for value in list1 if value in list2]
def check_eligibility_en(candidate):
    """Decide whether an en.wiki category may be used as a source.

    Args:
        candidate: en.wiki category title, with or without the
            ``Category:`` prefix. The titles this script passes in come
            from ``langlinks.ll_title`` and already carry the prefix.

    Returns:
        ``False`` when the title contains one of the maintenance keywords,
        or when the category is a direct member of one of the ineligible
        parent categories; ``True`` otherwise.
    """
    # Keywords are compared with the lower-cased title, so they have to be
    # lower case themselves; mixed-case entries could never match.
    excluded_patterns = ("articles", "wikiproject", "user")
    lowered = candidate.lower()
    # Cheap name-based check first: no need to query the wiki for a title
    # that is rejected by name alone.
    if any(pattern in lowered for pattern in excluded_patterns):
        return False

    ineligible_parents = [
        "Hidden categories",
        "Tracking categories",
        "Turkish people by occupation",
        "Cities in Turkey",
        "Turkish Kurdistan",
        "Iraqi Kurdistan",
        "Syrian Kurdistan",
        "Iranian Kurdistan",
        "Stub categories",
    ]

    # Strip an existing namespace prefix before adding our own; otherwise
    # the title would be built as "Category:Category:...".
    prefix = "Category:"
    if candidate.startswith(prefix):
        name = candidate[len(prefix):]
    else:
        name = candidate

    cat = pywikibot.Page(pywikibot.Site("en", "wikipedia"), "Category:%s" % name)
    cat_cats = get_existing_cats(cat)
    # Eligible only when none of the parents is on the ineligible list.
    return not list_intersection(ineligible_parents, cat_cats)
def get_existing_cats(page):
    """Return the titles of the categories ``page`` belongs to.

    Args:
        page: object whose ``categories()`` method yields category objects
            that expose ``title(with_ns=...)``.

    Returns:
        A list with ``title(with_ns=False)`` of every yielded category, in
        the order ``categories()`` produced them.
    """
    return [category.title(with_ns=False) for category in page.categories()]
def get_list(lang_code, query_name, query_params=None):
    """Run a SQL query against a wiki's replica database and return all rows.

    Args:
        lang_code: language code of the wiki (e.g. ``"ku"`` or ``"en"``);
            it is used to build both the host name and the database name.
        query_name: the SQL text to execute (despite the parameter name,
            this is the query itself).
        query_params: optional parameters for the ``%s`` placeholders in
            the query; ``None`` runs the query without substitution.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the query.
    """
    db_name = lang_code + "wiki_p"  # name of the target database
    connection = pymysql.connect(
        host=lang_code + "wiki.analytics.db.svc.wikimedia.cloud",
        port=3306,
        database=db_name,
        # User and password are read from the [client] section of this file.
        # Outside Toolforge, pass ``user=`` / ``password=`` here instead.
        read_default_file="~/replica.my.cnf",
        charset='utf8',
    )
    try:
        # The cursor context manager closes the cursor even when a query
        # raises; the ``finally`` below does the same for the connection.
        with connection.cursor() as cursor:
            # Use the correct wiki database
            cursor.execute("USE " + db_name + ";")
            # ``args=None`` means "no parameter substitution", so a single
            # call covers both the parameterised and the plain case.
            cursor.execute(query_name, query_params)
            return cursor.fetchall()
    finally:
        connection.close()
def add_delay(seconds):
    """Announce the pause on stdout, then block for ``seconds`` seconds."""
    notice = f"Waiting for {seconds} seconds..."
    print(notice)
    time.sleep(seconds)
# query
"""
ku_query returns the en.wiki sitelinks (titles including the namespace prefix)
of ku.wiki category pages whose first revision is less than 24 hours old,
e.g. the en.wiki counterpart of:
Kategorî:Avabûnên 1670î li Hindistanê

A category is only returned when it
- is itself a member of at least one category (it has a categorylinks row),
- was touched within the last 24 hours,
- is not a hidden category (no 'hiddencat' page property),
- does not transclude the template 'Kategoriya_çavdêriyê',
- has a langlink to en.wiki.
Results are ordered by the most recent cl_timestamp, newest first.
"""
ku_query = """
SELECT
    langlinks.ll_title AS enwiki_sitelink
FROM
    categorylinks
JOIN
    page ON cl_from = page_id
LEFT JOIN
    page_props ON page.page_id = page_props.pp_page AND page_props.pp_propname = 'hiddencat'
LEFT JOIN (
    SELECT
        rev_page,
        MIN(rev_timestamp) AS first_revision_timestamp
    FROM
        revision
    GROUP BY
        rev_page
) AS first_revision ON page_id = first_revision.rev_page
INNER JOIN
    langlinks ON cl_from = langlinks.ll_from AND langlinks.ll_lang = 'en'
WHERE
    page_namespace = 14
    AND page.page_touched >= NOW() - INTERVAL 1 DAY
    AND first_revision.first_revision_timestamp >= NOW() - INTERVAL 1 DAY
    AND page_props.pp_page IS NULL
    AND NOT EXISTS (
        SELECT
            1
        FROM
            templatelinks
        JOIN
            linktarget ON lt_id = tl_target_id
        WHERE
            tl_from = cl_from
            AND linktarget.lt_namespace = 10
            AND linktarget.lt_title = 'Kategoriya_çavdêriyê'
    )
GROUP BY
    enwiki_sitelink
ORDER BY
    MAX(cl_timestamp) DESC;
"""
"""
en_query returns the ku.wiki sitelinks of en.wiki pages (articles and categories) that belong to a given en.wiki category.
The category is matched through linktarget.lt_title, so the %s parameter is the category name without the 'Category:' prefix, with spaces as underscores.
Example result: Kategorî:Avabûnên 1670î li Hindistanê
"""
# Parameterised query: the single %s placeholder is compared with
# linktarget.lt_title in namespace 14, i.e. it is the en.wiki category name.
# Pages in namespaces 0 and 14 that are members of that category and have a
# 'ku' langlink are selected. The result is their ku.wiki titles, de-duplicated
# by the GROUP BY and sorted by the ORDER BY on ll_title.
en_query = """
SELECT
ll.ll_title AS ku_page_title
FROM
enwiki_p.page
INNER JOIN enwiki_p.langlinks AS ll
ON ll.ll_from = page.page_id
AND ll.ll_lang = 'ku'
INNER JOIN enwiki_p.categorylinks AS cl
ON cl.cl_from = page.page_id
INNER JOIN enwiki_p.linktarget AS lt
ON lt.lt_id = cl.cl_target_id
WHERE
page.page_namespace IN (0, 14)
AND lt.lt_namespace = 14
AND lt.lt_title = %s
GROUP BY
ll.ll_title
ORDER BY
ll.ll_title;
"""
# Main script.
# 1. Fetch the en.wiki sitelinks of the ku.wiki categories selected by ku_query.
# 2. For every eligible en.wiki category, collect the ku.wiki titles of its
#    members via en_query.
# 3. Write the de-duplicated titles to a list file and hand it to the
#    categorizewithcreator pwb script.
ku_results = get_list("ku", ku_query)
output_results = []
category_prefix = 'Category:'
for row in ku_results:
    enwiki_sitelink = row[0].decode('utf-8')
    if VERBOSE:
        print(f"Result: {enwiki_sitelink}\nWaiting for two seconds")
    add_delay(2)  # Add a delay of 2 seconds before each query
    if not check_eligibility_en(enwiki_sitelink):
        continue
    # Build the value matched against lt_title: drop only the leading
    # 'Category:' prefix and replace spaces with underscores.
    # A separate variable is used so enwiki_sitelink is not mutated.
    if enwiki_sitelink.startswith(category_prefix):
        cat_name = enwiki_sitelink[len(category_prefix):]
    else:
        cat_name = enwiki_sitelink
    cat_name = cat_name.replace(' ', '_')
    if VERBOSE:
        print(f"Getting list of pages in {cat_name}")
    en_query_results = get_list("en", en_query, (cat_name,))
    output_results.extend(
        (result[0].decode('utf-8'),) for result in en_query_results
    )

if VERBOSE:
    for result in output_results:
        print(f"output_results: {result}")
    print(f"Final output_results: {output_results}")

toolforge_home = os.getenv('HOMEPATH', '/data/project/balyozbot/')
listeyacategorize = os.path.join(toolforge_home, 'getlisteyacategorize.txt')

# 1. Remove duplicates while keeping order (dict keys keep insertion order)
unique_results = list(dict.fromkeys(result[0] for result in output_results))

# 2. Overwrite the file with unique results (also when the list is empty)
with open(listeyacategorize, 'w', encoding='utf-8') as file:
    file.writelines(f"{page_name}\n" for page_name in unique_results)

# 3. Check if we have pages to process
num_pages = len(unique_results)
if num_pages > 0:
    print(f"Working on {num_pages} pages")
    # Pass the same absolute path the list was written to, so the command
    # does not depend on the current working directory.
    pwb_command = [
        os.path.expandvars("$HOME/pwbvenv/bin/pwb"),
        "categorizewithcreator",
        "-always",
        "-lang:ku",
        "-family:wikipedia",
        f"-file:{listeyacategorize}",
    ]
    # An argument list (no shell) keeps a path with spaces or shell
    # metacharacters from being re-interpreted.
    completed = subprocess.run(pwb_command, check=False)
    if completed.returncode != 0:
        print(f"pwb exited with status {completed.returncode}")
else:
    print("No changes detected.")