# Export the pages, categories, internal links and redirects of a MediaWiki
# site (via mwclient) as a JSON graph.
from mwclient import Site, APIError
|
|
from mwclient.page import Page
|
|
import argparse, json, sys
|
|
|
|
# Command-line options: where the wiki lives, an optional cap on the number
# of pages exported, and where the JSON graph is written.
# The description is passed by keyword: the first positional parameter of
# ArgumentParser is `prog`, so ArgumentParser("") blanked the program name
# in the usage/help text.
ap = argparse.ArgumentParser(
    description="Export a MediaWiki site's pages, categories, links and "
                "redirects as a JSON graph."
)
ap.add_argument("--wikiprotocol", default="http",
                help="protocol used to reach the wiki (default: http)")
ap.add_argument("--wikihost", default="localhost",
                help="host name of the wiki (default: localhost)")
ap.add_argument("--wikipath", default="/mw/",
                help="script path of the wiki on the host (default: /mw/)")
# type=int: without it the value arrives as a string and the later
# `count >= args.limit` comparison raises TypeError on Python 3.
ap.add_argument("--limit", type=int, default=None,
                help="stop after exporting this many pages (default: no limit)")
ap.add_argument("--output", default=sys.stdout, type=argparse.FileType('w'),
                help="file receiving the JSON graph (default: standard output)")
# Authentication options are not wired up yet:
# ap.add_argument("--user", default=None)
# ap.add_argument("--password", default=None)
args = ap.parse_args()

# Connection to the wiki, shared by everything below.
site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
|
|
|
|
|
|
def category_subcats(site, cattitle, objects=True):
    """Return the subcategories listed as members of the category *cattitle*.

    Issues ``list=categorymembers`` queries restricted to ``cmtype=subcat``
    in batches of 50 and keeps following the ``cmcontinue`` token found in
    the response's ``continue`` section until no token is returned.

    Args:
        site: Object exposing ``api(action, **params)`` and a ``pages``
            mapping indexed by title (an mwclient ``Site`` in this script).
        cattitle: Category title, passed to the API as ``cmtitle``.
        objects: When true (the default), each title is looked up in
            ``site.pages`` and the resulting objects are returned; otherwise
            the plain titles are returned.

    Returns:
        A list of page objects or title strings, in the order the API
        returned them.
    """
    titles = []
    continuation = {}  # extra query parameters carrying the continue token
    while True:
        resp = site.api(
            "query",
            list="categorymembers",
            cmtitle=cattitle,
            cmtype="subcat",
            cmlimit=50,
            **continuation
        )
        titles.extend(member['title'] for member in resp['query']['categorymembers'])

        # A response without a cmcontinue token is the last batch.  Looking
        # it up with .get() also covers a 'continue' section that carries no
        # cmcontinue key, which previously raised KeyError.
        token = resp.get('continue', {}).get('cmcontinue')
        if token is None:
            break
        continuation = {'cmcontinue': token}

    if objects:
        return [site.pages[title] for title in titles]
    return titles
|
|
|
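"""
Example of the two name attributes of an mwclient page object `c`
(here a category page on a French-language wiki):

>>> c.page_title
'Équipes'
>>> c.name
'Catégorie:Équipes'

`page_title` is the title without the namespace prefix; `name` includes it.
"""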
|
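"""
Shape of the JSON graph written to --output:

graph = {
  nodes: [
    {title: "Bienvenue", ns: 0, cats: ["category title", ...]}
  ],
  links: [
    {source: "name", target: "name2"}
  ],
  redirects: {"redirect page name": "target page name"}
}
"""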
|
|
# Module-level accumulators filled in by the two crawl passes below.

# Pass 1 results.
redirects = dict()      # name of a redirect page -> name of the page it points to
page_exists = dict()    # name of every non-redirect page -> True (used for membership tests)

# Pass 2 results.
pages = list()          # one dict per exported page; becomes graph['nodes']
all_links = set()       # (name, name) tuples describing links between existing pages
count = 0               # number of pages appended to `pages` so far
|
|
|
|
def resolve(x):
    """Follow the redirect chain starting at the page name *x*.

    Looks *x* up in the module-level ``redirects`` mapping repeatedly and
    returns the first name that is not itself a redirect.  Names that are
    not redirects are returned unchanged.

    A redirect loop (A -> B -> A, or a page redirecting to itself) used to
    spin forever; the walk now stops as soon as it reaches a name it has
    already visited and returns that name.
    """
    visited = set()
    while x in redirects and x not in visited:
        visited.add(x)
        x = redirects[x]
    return x
|
|
|
|
# Pass 1: walk every page once and record whether it is a redirect.
# Redirects go into `redirects` (name -> target name); every other page is
# registered in `page_exists` so that pass 2 can drop links to missing pages.
#
# The banner goes to stderr, like the per-page progress of pass 2: stdout is
# the default --output, and a stray "Pass 1" line there made the emitted
# JSON unparseable.
print("Pass 1", file=sys.stderr)
for page in site.allpages():
    target = page.redirects_to()  # falsy when the page is not a redirect
    if target:
        redirects[page.name] = target.name
    else:
        page_exists[page.name] = True
|
|
|
|
# Pass 2: build one node per non-redirect page and collect the links between
# existing pages.
#
# The banner goes to stderr so it cannot end up inside the JSON document
# when --output is left at its default (stdout).
print("Pass 2", file=sys.stderr)

# --limit may reach this point as a string; normalise it once so that the
# comparison with the integer `count` below cannot raise TypeError.
# A limit of 0 (or no limit at all) means "export everything".
limit = int(args.limit) if args.limit is not None else None

for p in site.allpages():
    if p.name in redirects:
        # Redirect pages are not nodes; links to them are resolved below.
        continue

    pd = {'title': p.page_title, 'ns': 0}
    print("Page {0}".format(p.name), file=sys.stderr)

    # categories
    pd['cats'] = [c.page_title for c in p.categories()]

    # links: keep main-namespace targets only, follow redirects to the final
    # page, and drop targets that are not known non-redirect pages.
    for linked in p.links():
        if linked.namespace != 0:
            continue
        target = resolve(linked.name)
        if target not in page_exists:
            continue
        # Store each pair with the smaller name first so that A -> B and
        # B -> A collapse into a single entry of the set.
        if p.name < target:
            all_links.add((p.name, target))
        else:
            all_links.add((target, p.name))

    pages.append(pd)
    count += 1
    if limit and count >= limit:
        break
|
|
|
|
# Assemble the final document and write it to --output.
# `all_links` is a set, whose iteration order changes from run to run under
# string hash randomisation; sorting the pairs makes the exported file
# reproducible for identical wiki content.
graph = {
    'nodes': pages,
    'links': [{'source': a, 'target': b} for a, b in sorted(all_links)],
    'redirects': redirects,
}
print(json.dumps(graph, indent=2), file=args.output)
|