Gebruiker:Naudefj/Linkbot

in Wikipedia, die vrye ensiklopedie
Jump to navigation Jump to search

Python 3.x-weergawe[wysig | wysig bron]

  1 #!/usr/bin/python3
  2 # -*- coding: utf-8  -*-
  3 """
  4 Robot om webskakels op die Wayback Machine te argiveer.
  5 
  6 Die volgende parameters word ondersteun:
  7 
  8 &params;
  9 
 10 -dry              Moenie enige veranderinge maak nie, maar wys wat verander sou word.
 11 
 12 """
 13 #
 14 __version__ = '$Id: 9c315ea1c38f5f9f2d74a4e8929403ffa35b2987 $'
 15 #
 16 
 17 import re
 18 import time
 19 import json
 20 import urllib
 21 from urllib.request import urlopen
 22 from urllib.parse import urlparse
 23 import ssl
 24 import pywikibot
 25 from pywikibot import pagegenerators
 26 from pywikibot import i18n
 27 
# Maps the '&params;' placeholder that appears in the module docstring to the
# shared page-generator help text.  This is required for the text that is
# shown when you run this script with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}
 33 
 34 class bcolors:
 35     HEADER = '\033[95m'
 36     OKBLUE = '\033[94m'
 37     OKGREEN = '\033[92m'
 38     WARNING = '\033[93m'
 39     FAIL = '\033[91m'
 40     ENDC = '\033[0m'
 41     BOLD = '\033[1m'
 42     UNDERLINE = '\033[4m'
 43 
 44 class BasicBot:
 45     def __init__(self, generator, dry):
 46         """
 47         Constructor.
 48 
 49         Parameters:
 50             @param generator: The page generator that determines on which pages
 51                               to work.
 52             @type generator: generator.
 53             @param dry: If True, doesn't do any real changes, but only shows
 54                         what would have been changed.
 55             @type dry: boolean.
 56         """
 57         self.generator = generator
 58         self.dry = dry
 59 
 60         if hasattr(ssl, '_create_unverified_context'):
 61            ssl._create_default_https_context = ssl._create_unverified_context
 62 
 63         # Set the edit summary message
 64         site = pywikibot.Site()
 65         self.summary = i18n.twtranslate(site, 'basic-changing')
 66 
 67     def run(self):
 68         """ Verwerk elke bladsy van die generator. """
 69         for page in self.generator:
 70             self.treat(page)
 71 
 72     def treat(self, page):
 73         """ Laai die bladsy, wysig en stoor dit. """
 74         text = self.load(page)
 75         if not text:
 76             return
 77 
 78         ################################################################
 79         # NOTA: Here you can modify the text in whatever way you want. #
 80         ################################################################
 81 
 82         t = page.title(asLink=False);
 83         pywikibot.output('Verwerk bladsy %s%s%s...' % (bcolors.BOLD, t, bcolors.ENDC))
 84         talktext = '';
 85 
 86         it = re.findall(r'(https?://[^\s|<>{}\[\]]+)', text)
 87         it = list(set(it))	# Remove dups
 88         for wurl in it:
 89             if "archive.org/" in wurl: continue
 90             url = urllib.parse.quote(wurl.encode('utf8'), ':/')
 91             ia = urlopen('http://archive.org/wayback/available?url=%s' % url).read().decode('utf8')
 92             data = json.loads(ia)
 93 #
 94 #{"archived_snapshots":
 95 #	{"closest":
 96 #		{"available":true,
 97 #		 "url":"http://web.archive.org/web/20150223120334/http://www.orafaq.com:80/forum/?",
 98 #		 "timestamp":"20150223120334",
 99 #                "status":"200"
100 #                }
101 #         }
102 #}
103 
104             if '"available":true' in ia:
105                 # Page archived, check to see if original URL is still OK.
106                 aurl  = data["archived_snapshots"]["closest"]["url"]
107                 atime = data["archived_snapshots"]["closest"]["timestamp"]
108                 pywikibot.output('%s... URL %s is reeds geargiveer%s' % (bcolors.OKBLUE, wurl, bcolors.ENDC))
109                 try:
110                     ia = urlopen(wurl)
111                     rc = ia.getcode()
112                     if rc == 200:
113                         pywikibot.output('%s... ... URL is steeds OK%s' % (bcolors.OKBLUE, bcolors.ENDC))
114                     elif rc == 404:
115                         pywikibot.output('%s... ... 404 verander na %s%s' % (bcolors.FAIL, aurl, bcolors.ENDC))
116                         pywikibot.output('%s... ... {{Wayback|url=%s|date=%s}}%s' % (bcolors.FAIL, url, atime, bcolors.ENDC))
117                         talktext += "|-\n| %s || {{Wayback|url=%s|date=%s}}\n" % (wurl, url, atime)
118                         print(talktext)
119                     else:
120                         pywikibot.output('%s... ... HTTP %s-fout%s' % (bcolors.OKBLUE, rc, bcolors.ENDC))
121                 except IOError as e:
122                     pywikibot.output('%s... ... URL het aandag nodig %s%s' % (bcolors.FAIL, e, bcolors.ENDC))
123                     # pywikibot.output('%s... ... verander na %s%s' % (bcolors.FAIL, aurl, bcolors.ENDC))
124                     # pywikibot.output('%s... ... {{Wayback|url=%s|date=%s}}%s' % (bcolors.FAIL, url, atime, bcolors.ENDC))
125                     #if str(e) == 'HTTP Error 404: Not Found':
126                     #   talktext += "|-\n| %s || {{Wayback|url=%s|date=%s}}\n" % (wurl, url, atime)
127                     #   print(talktext)
128             else:
129                 try:
130                     ia = urlopen('https://web.archive.org/save/%s' % url).read()
131                 except IOError as e:
132                     pywikibot.output('%s... URL %s gee %s%s' % (bcolors.FAIL, url, e, bcolors.ENDC))
133                     ia = '';
134             if 'FILE ARCHIVED ON' in str(ia):
135                 pywikibot.output('%s... URL %s suksesvol opgelaai%s' % (bcolors.OKGREEN, wurl, bcolors.ENDC))
136             elif '403 Forbidden' in str(ia):
137                 pywikibot.output('... URL %s is geblokkeer op archive.org' % wurl)
138             elif '404: Not Found' in str(ia):
139                 pywikibot.output('... URL %s is dood' % wurl)
140             elif 'due to robots.txt' in str(ia):
141                 pywikibot.output('... URL %s is geblokkeer deur robots.txt' % wurl)
142             elif 'URL has been excluded from the Wayback Machine' in str(ia):
143                 pywikibot.output('... URL %s is deur Wayback Machine utgesluit' % wurl)
144             elif 'look like an valid URL' in str(ia):
145                 pywikibot.output('%s... URL %s lys soos ''n ongeldige URL%s' % (bcolors.WARNING, wurl, bcolors.ENDC))
146             elif 'url is not available on the live web' in str(ia):
147                 pywikibot.output('... URL %s is nie beskikbaar nie' % wurl)
148             else:
149                 #ia = str(ia)#, errors='ignore')
150                 pywikibot.output('... URL %s het gefaal: %s' % (wurl, ia))
151             time.sleep(1)
152 
153         if talktext:
154            talktext = "== Geargiveerde skakels ==\n{| class=\"wikitable\"\n|-\n! Dooie skakel !! Argief\n" + talktext + "|}"
155            talkpage = page.toggleTalkPage()
156            if talkpage.exists():
157               talktext_prev = talkpage.get()
158               if talktext == talktext_prev:
159                  pywikibot.output("Reeds gestoor - slaan oor...\n")
160               else:
161                  talktext_prev = re.sub(r'== Geargiveerde skakels ==.*\|\}', '', talktext_prev, flags=re.MULTILINE|re.DOTALL)
162                  if talktext_prev != '':
163                     talktext_prev += "\n\n"
164                  talktext_prev += talktext
165                  pywikibot.output('Nuwe blad: [' + talktext_prev + ']')
166                  talkpage.put(talktext_prev, 'Rapporteer dooie skakels wat geargiveer is');
167            else:
168               pywikibot.output('Skep bespreking: [' + talktext + ']')
169               talkpage.put(talktext, 'Rapporteer dooie skakels wat geargiveer is');
170         # if not self.save(text, page, self.summary):
171         #    pywikibot.output(u'Page %s not saved.' % page.title(asLink=True))
172 
173     def load(self, page):
174         """ Laai die teks van 'n gegewe bladsy. """
175         try:
176             # Load the page
177             text = page.get()
178         except pywikibot.NoPage:
179             pywikibot.output(u"Bladsy %s bestaan nie, slaan oor."
180                              % page.title(asLink=True))
181         except pywikibot.IsRedirectPage:
182             pywikibot.output(u"Bladsy %s is 'n aanstuur; slaan oor."
183                              % page.title(asLink=True))
184         else:
185             return text
186         return None
187 
188     def save(self, text, page, comment=None, minorEdit=True,
189              botflag=True):
190         """ Opdateer 'n gegewe bladsy met muwe teks. """
191         # only save if something was changed
192         if text != page.get():
193             # Show the title of the page we're working on.
194             # Highlight the title in purple.
195             pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
196                              % page.title())
197             # show what was changed
198             pywikibot.showDiff(page.get(), text)
199             pywikibot.output(u'Comment: %s' % comment)
200             if not self.dry:
201                 if pywikibot.input_yn(
202                         u'Do you want to accept these changes?',
203                         default=False, automatic_quit=False):
204                     try:
205                         page.text = text
206                         # Save the page
207                         page.save(comment=comment or self.comment,
208                                   minor=minorEdit, botflag=botflag)
209                     except pywikibot.LockedPage:
210                         pywikibot.output(u"Bladsy %s is gesluit; slaan oor."
211                                          % page.title(asLink=True))
212                     except pywikibot.EditConflict:
213                         pywikibot.output(
214                             u'Slaan %s as gevolg van ''n wysigingskonflik oor'
215                             % (page.title()))
216                     except pywikibot.SpamfilterError as error:
217                         pywikibot.output(
218                             u'Kan nie %s wysig nie, swattlys-inskrywing %s'
219                             % (page.title(), error.url))
220                     else:
221                         return True
222         return False
223 
224 
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Any local argument starting with "-dry" switches on dry-run mode; every
    other local argument is handed to the page generator factory.  When the
    factory yields no generator, the help text is shown instead.

    @param args: command line arguments
    @type args: list of unicode
    """
    # Global options (such as the target site) are consumed here; only the
    # script-specific arguments come back.
    remaining = pywikibot.handle_args(args)

    # The factory interprets the page-selection arguments shared with other
    # scripts.
    factory = pagegenerators.GeneratorFactory()

    dry = False
    for option in remaining:
        if not option.startswith("-dry"):
            factory.handleArg(option)
            continue
        dry = True

    pages = factory.getCombinedGenerator()
    if not pages:
        pywikibot.showHelp()
        return

    # The preloading generator fetches several pages from the wiki at once.
    BasicBot(pagegenerators.PreloadingGenerator(pages), dry).run()
264 
# Run the bot only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()