Moving from MediaWiki to SharePoint O365 - extractPages.php

This is the source code of the extractPages script, which creates an individual HTML file for each article contained in a MediaWiki XML dump.

Before running it, don't forget to update $rootPath at the top of the script and to replace the yourSite placeholders in the skipped-namespace check with your own wiki's namespace names.




 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
<?php
/*
 * extractPages.php
 *
 * Reads a MediaWiki XML dump and writes one HTML file per <page> element into
 * the "articlesSplit/" subfolder of $rootPath. Each file contains the page
 * title as an <h1> heading followed by the content of the page's first <text>
 * element, exactly as it appears in the dump (no wikitext conversion is done
 * here). Pages in the namespaces listed in $skippedNamespaces are not written.
 *
 * The dump is scanned with plain string searches rather than an XML parser,
 * so titles and text keep the XML escaping used in the dump.
 */

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

// Folder that contains the dump; must end with a slash.
$rootPath = "C:/something/UniServerZ/www/wikimigration/articles/";
$xmlDumpFilename = "backup-dump.xml";

// Namespaces whose pages are skipped. Write the names with spaces; a title
// prefix written with underscores ("User_talk:") is matched as well.
// Replace the two yourSite entries with your own wiki's namespace names.
$skippedNamespaces = [
    'User',
    'User talk',
    'Category',
    'Talk',
    'Module',
    'MediaWiki',
    'yourSite',
    'yourSite talk',
    'File',
    'File talk',
];

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Returns the content of the first <$tagName ...>...</$tagName> element in $xml.
 *
 * @param string $xml     XML fragment to search.
 * @param string $tagName Element name without angle brackets, e.g. "text".
 *
 * @return string|null The raw (still XML-escaped) content, "" for a
 *                     self-closing element such as <text bytes="0" />,
 *                     or null when no complete element is found.
 */
function extractElementContent($xml, $tagName)
{
    $openPos = strpos($xml, "<$tagName");
    if ($openPos === false) {
        return null;
    }

    // The opening tag may carry attributes, so look for its closing bracket.
    $openEnd = strpos($xml, ">", $openPos);
    if ($openEnd === false) {
        return null;
    }

    // A self-closing element has no content and no closing tag.
    if ($xml[$openEnd - 1] === "/") {
        return "";
    }

    $contentStart = $openEnd + 1;
    $closePos = strpos($xml, "</$tagName>", $contentStart);
    if ($closePos === false) {
        return null;
    }

    return substr($xml, $contentStart, $closePos - $contentStart);
}

/**
 * Turns a page title from the dump into a filename (without extension).
 *
 * Each of the characters < > : " / \ | ? * is replaced by two underscores;
 * afterwards the XML entity &amp; is turned back into a plain "&".
 * Note that different titles can map to the same filename (e.g. "A/B" and
 * "A:B"), in which case the later page overwrites the earlier file.
 *
 * @param string $title Page title as it appears in the dump.
 *
 * @return string
 */
function titleToFilename($title)
{
    $illegalCharacters = ["<", ">", ":", '"', "/", "\\", "|", "?", "*"];
    $filename = str_replace($illegalCharacters, "__", $title);

    return str_replace("&amp;", "&", $filename);
}

/**
 * Tells whether a page title belongs to one of the skipped namespaces.
 *
 * Only the part of the title before the first colon is compared, so a title
 * that merely contains a namespace name somewhere else is not skipped.
 *
 * @param string   $title             Page title as it appears in the dump.
 * @param string[] $skippedNamespaces Namespace names, written with spaces.
 *
 * @return bool
 */
function isInSkippedNamespace($title, array $skippedNamespaces)
{
    $colonPos = strpos($title, ":");
    if ($colonPos === false) {
        return false;
    }

    // Accept both "User talk:" and "User_talk:" spellings of the prefix.
    $namespace = str_replace("_", " ", substr($title, 0, $colonPos));

    return in_array($namespace, $skippedNamespaces, true);
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

$dumpPath = $rootPath.$xmlDumpFilename;
$outputDir = $rootPath."articlesSplit/";
$pagecount = 0; $skipped = 0;

$file = is_file($dumpPath) ? file_get_contents($dumpPath) : false;

if ($file === false) {
    echo "cannot read dump file $dumpPath\r\n";
} elseif (!is_dir($outputDir) && !mkdir($outputDir, 0777, true)) {
    echo "cannot create output folder $outputDir\r\n";
} else {
    // Walk through the dump with a moving offset instead of repeatedly
    // cutting the processed part off the (potentially very large) string.
    $offset = 0;
    while (($pageStartPos = strpos($file, "<page>", $offset)) !== false) {
        $pageEndPos = strpos($file, "</page>", $pageStartPos);
        if ($pageEndPos === false) {
            echo "unterminated <page> element at byte $pageStartPos, stopping\r\n";
            break;
        }
        $pagecount++;

        //extract one page = one article (7 = length of "</page>")
        $page = substr($file, $pageStartPos, $pageEndPos + 7 - $pageStartPos);
        $offset = $pageEndPos + 7;

        //the title of the article
        $title = extractElementContent($page, "title");
        if ($title === null) {
            echo "issue with page number $pagecount: no title found\r\n";
            continue;
        }
        $filename = titleToFilename($title);

        if (isInSkippedNamespace($title, $skippedNamespaces)) {
            echo "skipping file $filename\r\n";
            $skipped++;
            continue;
        }

        //the actual textual content of the article (first <text> element only)
        $text = extractElementContent($page, "text");
        if ($text === null) {
            echo "issue with article $title\r\n";
            continue;
        }

        //new content = (original) title + content
        $newContent = "<h1>$title</h1>
$text";

        //write page to file
        if (file_put_contents($outputDir."$filename.html", $newContent) === false) {
            echo "issue with article $title\r\n";
        }
    }
}
echo "$pagecount pages processed, $skipped skipped (in unrequired namespaces)";
?>

No comments:

Post a Comment