# Dump a MediaWiki site to a JSON graph of pages (nodes), page-to-page links, and redirects.
from mwclient import Site, APIError
from mwclient.page import Page
import argparse, json, sys

ap = argparse.ArgumentParser("")
ap.add_argument("--wikiprotocol", default="http")
ap.add_argument("--wikihost", default="localhost")
ap.add_argument("--wikipath", default="/mw/")
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--output", default=sys.stdout, type=argparse.FileType('w'))
# ap.add_argument("--user", default=None)
# ap.add_argument("--password", default=None)
args = ap.parse_args()

site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)


# Helper (not used in the main flow below): list the subcategories of a category,
# following the MediaWiki API's continuation mechanism.
def category_subcats(site, cattitle, objects=True):
    cmcontinue = None
    ret = []
    while True:
        if cmcontinue is None:
            resp = site.api("query", list="categorymembers", cmtitle=cattitle, cmtype="subcat", cmlimit=50)
        else:
            resp = site.api("query", list="categorymembers", cmtitle=cattitle, cmtype="subcat", cmlimit=50, cmcontinue=cmcontinue)
        ret.extend([x['title'] for x in resp['query']['categorymembers']])
        if 'continue' in resp:
            cmcontinue = resp['continue']['cmcontinue']
        else:
            break
    if objects:
        # print("converting to page objects ({0})".format(len(ret)))
        ret = [site.pages[x] for x in ret]
    return ret


"""
>>> c.page_title
'Équipes'
>>> c.name
'Catégorie:Équipes'
"""

"""
graph = {
    nodes: [
        {name: "Bienvenue"}
    ]
    links: [
        {source: "name", target: "name2"}
    ]
}
"""

redirects = {}
pages = []
count = 0
all_links = set()
page_exists = {}


def resolve(x):
    # Follow redirect chains to their final target title.
    while x in redirects:
        x = redirects[x]
    return x


# Pass 1: record redirects and remember which titles are real (non-redirect) pages.
print("Pass 1", file=sys.stderr)
for p in site.allpages():
    r = p.redirects_to()
    if r:
        redirects[p.name] = r.name
    else:
        page_exists[p.name] = True

# Pass 2: build node records and collect undirected links between existing pages.
print("Pass 2", file=sys.stderr)
# pages_by_title = {}
for p in site.allpages():
    if p.name in redirects:
        continue
    pd = {}
    pd['title'] = p.page_title
    pd['ns'] = 0
    print("Page {0}".format(p.name), file=sys.stderr)
    # categories
    cats = [c.page_title for c in p.categories()]
    pd['cats'] = cats
    # links: main namespace only, redirects resolved, restricted to pages that exist
    links = [x for x in p.links() if x.namespace == 0]
    links = [resolve(x.name) for x in links]
    links = [x for x in links if x in page_exists]
    for l in links:
        # store each link once, with its endpoints in a canonical (sorted) order
        if p.name < l:
            link = (p.name, l)
        else:
            link = (l, p.name)
        all_links.add(link)
    pages.append(pd)
    count += 1
    if args.limit and count >= args.limit:
        break

graph = {}
graph['nodes'] = pages
graph['links'] = [{'source': a, 'target': b} for a, b in all_links]
graph['redirects'] = redirects

print(json.dumps(graph, indent=2), file=args.output)
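
# Example invocation (a minimal sketch: the script filename, wiki host, and path are
# hypothetical placeholders; only the command-line flags come from the parser above):
#
#   python wiki2graph.py --wikiprotocol https --wikihost wiki.example.org --wikipath /w/ \
#       --limit 100 --output graph.json
#
# Progress messages go to stderr, so the JSON graph can also be piped from stdout
# by omitting --output.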