1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
| <?php
// htmLawed is a separate download; htmLawed.php is loaded relative to this script.
require_once('htmLawed.php');

// ---- Settings: replace every {placeholder} below before running ----
define('POSTTABLE', 'wp_posts');
define('DOMAIN', '{insert your domain here}');
define('SITEROOT', '{insert your site root here}');
define('DBNAME', '{insert your database name here}');
define('HOSTNAME', '{insert your hostname here - often localhost}');
define('DBUSER', '{insert your db username here}');
define('DBPWD', '{insert your db password here}');
// When TESTINGSINGLE is true, database writes are limited to the post whose ID is TESTID.
define('TESTINGSINGLE', false);
define('TESTID', '2534'); // test a single post ID

$dbname = DBNAME;
$conn = mysql_connect(HOSTNAME, DBUSER, DBPWD) or die ('Error connecting to mysql');

// Elements that htmLawed will restrict to
$elements = 'a, b, blockquote, br, caption, center, cite, code, col, colgroup, dd, dl, dt, em, embed, fieldset, h1, h2, h3, h4, h5, h6, hr, i, img, li, ol, p, small, strong, sub, sup, table, tbody, td, th, thead, tr, ul';

// htmLawed config variables
$config = array(
    'clean_ms_char' => 2,
    'tidy' => 1,
    'elements' => $elements,
    'cdata' => 1,
    'comment' => 1,
    'deny_attribute' => 'align'
);

// Stop here if the database cannot be selected: every later query would fail anyway.
mysql_select_db($dbname) or die ('Error selecting database: ' . mysql_error());

$doc = new DOMDocument();

// Every published post and page is a candidate for clean-up.
$query = mysql_query("SELECT ID, post_date, post_title, post_name, post_content, post_type FROM " . POSTTABLE . " WHERE post_type IN ('post', 'page') AND post_status = 'publish'")
    or die ('Error reading posts: ' . mysql_error());
/**
 * Map a class/align attribute pair onto a WordPress alignment class.
 *
 * Directions are tested in the order left, right, centre/center, so a value
 * mentioning several directions resolves to the first one in that order.
 *
 * @param string $class Value of the class attribute ('' when absent).
 * @param string $align Value of the align attribute ('' when absent).
 * @return string|null 'alignleft', 'alignright' or 'aligncenter'; null when
 *                     neither attribute mentions a direction.
 */
function plone_alignment_class($class, $align) {
    $directions = array(
        'left' => 'alignleft',
        'right' => 'alignright',
        'centre' => 'aligncenter',
        'center' => 'aligncenter',
    );
    foreach ($directions as $needle => $wpclass) {
        if (strpos($class, $needle) !== false || strpos($align, $needle) !== false) {
            return $wpclass;
        }
    }
    return null;
}

/**
 * Give every <img> the WordPress floated class matching its old alignment.
 *
 * The image's own class/align attributes are checked first, then its parent's,
 * then its grandparent's. Images with no detectable alignment are left alone.
 *
 * @param DOMDocument $doc Document to modify in place.
 * @return void
 */
function plone_align_images(DOMDocument $doc) {
    // Wording used in the progress output for each resulting class.
    $labels = array('alignleft' => 'left', 'alignright' => 'right', 'aligncenter' => 'centred');

    foreach ($doc->getElementsByTagName('img') as $img) {
        // getAttribute() returns '' for a missing attribute, so no null checks are needed.
        $linkfile = $img->getAttribute('src');
        $wpclass = plone_alignment_class($img->getAttribute('class'), $img->getAttribute('align'));

        if ($wpclass === null) {
            $ancestor = $img->parentNode;
            foreach (array('parent', 'grandparent') as $relation) {
                // Only elements carry attributes; stop once the walk leaves the element tree.
                if (!($ancestor instanceof DOMElement)) {
                    break;
                }
                $ancestorclass = $ancestor->getAttribute('class');
                $ancestoralign = $ancestor->getAttribute('align');
                $wpclass = plone_alignment_class($ancestorclass, $ancestoralign);
                if ($wpclass !== null) {
                    print "\n\n" . '** ' . $labels[$wpclass] . ' ' . $relation . ' ' . $linkfile . " - class: " . $ancestorclass . " - align: " . $ancestoralign . "\n";
                    break;
                }
                $ancestor = $ancestor->parentNode;
            }
        }

        if ($wpclass !== null) {
            $img->setAttribute('class', $wpclass);
        }
    }
}

/**
 * Replace underscores with dashes in relative links.
 *
 * Links are only rewritten when the post's own slug, or its parent's slug,
 * contains a dash. Absolute URLs, root-relative paths, ../ paths (too
 * complicated for now), links with a URI scheme such as mailto:, and in-page
 * #fragments are left untouched.
 *
 * @param DOMDocument $doc  Document to modify in place.
 * @param object      $post Row from the posts table; ID and post_name are read.
 * @return void
 */
function plone_fix_relative_links(DOMDocument $doc, $post) {
    // Decided lazily so the parent lookup runs at most once per post, and only if needed.
    $slugsusedashes = null;

    foreach ($doc->getElementsByTagName('a') as $anchor) {
        $linkhref = $anchor->getAttribute('href');
        if (strpos($linkhref, '_') === false) {
            continue;
        }
        $isrelative = !(substr($linkhref, 0, 4) == 'http'
            || substr($linkhref, 0, 1) == '/'
            || substr($linkhref, 0, 3) == '../'
            || substr($linkhref, 0, 1) == '#'
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $linkhref));
        if (!$isrelative) {
            continue;
        }

        if ($slugsusedashes === null) {
            $slugsusedashes = strpos($post->post_name, '-') !== false;
            if (!$slugsusedashes) {
                // Self-join: fetch the slug of the row whose ID is this post's post_parent.
                $parentresult = mysql_query("SELECT parent.post_name FROM " . POSTTABLE . " AS child INNER JOIN " . POSTTABLE . " AS parent ON parent.ID = child.post_parent WHERE child.ID = " . (int) $post->ID);
                if ($parentresult && ($parent = mysql_fetch_object($parentresult))) {
                    $slugsusedashes = strpos($parent->post_name, '-') !== false;
                }
            }
        }
        if (!$slugsusedashes) {
            // The condition is per post, so no link in this document qualifies.
            return;
        }

        print "\n\n" . 'link: ' . $linkhref . "\n";
        $linkhref = str_replace('_', '-', $linkhref);
        $linkhref = preg_replace('/--+/', '-', $linkhref);
        print "\n\n" . 'changed internal link: ' . $linkhref . "\n";
        $anchor->setAttribute('href', $linkhref);
    }
}

/**
 * Produce the cleaned-up HTML for one post.
 *
 * Steps: strip leftover Plone page furniture, convert image alignment to
 * WordPress classes, fix relative links, apply text rewrites, drop the doctype
 * and empty paragraphs, then run htmLawed.
 *
 * @param object $post   Row from the posts table; ID, post_name and post_content are read.
 * @param array  $config htmLawed configuration.
 * @return string|false Cleaned HTML ('' for a post with no content), or false
 *                      when the content could not be parsed.
 */
function plone_clean_post_html($post, $config) {
    $oldpost = (string) $post->post_content;
    if (trim($oldpost) === '') {
        // loadHTML() rejects empty input; there is nothing to clean anyway.
        return '';
    }

    // A fresh document per post guarantees a failed load can never leave the
    // previous post's markup behind to be saved into this one.
    $doc = new DOMDocument();
    // Real-world markup is rarely valid; collect libxml complaints instead of printing them.
    $previouserrors = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($oldpost);
    libxml_clear_errors();
    libxml_use_internal_errors($previouserrors);
    if (!$loaded) {
        return false;
    }

    // Remove content inside certain classes inside page
    // The below elements are leftover elements from Plone
    // See XPath documentation for more details of how to make queries
    $pagepath = new DOMXPath($doc);
    $toremove = $pagepath->query("//h1[@class='documentFirstHeading'] | //p[@class='documentDescription'] | //div[@class='documentDescription'] | //div[@class='documentByLine'] | //div[@class='documentActions'] | //div[@id='relatedItems'] | //div[@class='discussion'] | //a[@id='documentContent'] | //a[@class='link-parent']");
    foreach ($toremove as $entry) {
        if ($entry->parentNode) {
            $entry->parentNode->removeChild($entry);
        }
    }

    plone_align_images($doc);
    plone_fix_relative_links($doc, $post);

    $newtablehtml = $doc->saveHTML();

    // Text rewriting - this can be modified to your needs.
    // Applied in the order listed; some entries rely on an earlier one having run.
    $rewrites = array(
        '[...]' => '',
        '/index.html"' => '"',
        '/"' => '"',
        'https://my.' => 'http://www.',
        // Sometimes there are encoding issues which need dealing with
        '&Acirc;&nbsp;' => '',
        '&Acirc;' => '',
        '&#132;' => '',
        '&acirc;&#8364;&#8482;' => "'",
        '&#145;' => "'",
        '&#146;' => "'",
        '&#147;' => "'",
        '&#148;' => "'",
        '&#150;' => ' - ',
        '&#151;' => ' - ',
        '&acirc;&#128;&#153;' => "'",
        '&acirc;&#128;&#156;' => '',
        '&acirc;&#128;&#157;' => '',
    );
    $newtablehtml = str_replace(array_keys($rewrites), array_values($rewrites), $newtablehtml);

    // Sometimes the old page still contains html doctype
    // inside the content tag
    if (strpos($newtablehtml, '<html>') !== 0) {
        $newtablehtml = str_replace('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">', '', $newtablehtml);
    }

    // Remove empty paragraphs
    $newtablehtml = preg_replace("#<p[^>]*>(\s|&nbsp;?)*</p>#", '', $newtablehtml);

    // Now run htmLawed to clean up
    return htmLawed($newtablehtml, $config);
}

// Main pass: clean each selected post, then save it, or delete it if almost nothing is left.
while ($post = mysql_fetch_object($query)) {
    // In single-post test mode, ignore everything except the post under test.
    if (!(TESTINGSINGLE == false || $post->ID == TESTID)) {
        continue;
    }
    print "\n\n" . '**opening ' . $post->ID . ' - name: ' . $post->post_name . "\n";

    $newtablehtml = plone_clean_post_html($post, $config);
    if ($newtablehtml === false) {
        // Leave the post untouched rather than risk deleting it over a parse failure.
        print "\n\n" . '** could not parse content of ' . $post->ID . " - skipped\n";
        continue;
    }

    // Normalise post titles in all caps
    if (strtoupper($post->post_title) === $post->post_title) {
        $post->post_title = ucwords(strtolower($post->post_title));
    }

    $postid = (int) $post->ID;

    if (strlen($newtablehtml) > 30) {
        $assignments = array("post_content = '" . mysql_real_escape_string($newtablehtml) . "'");
        if (strlen(trim($post->post_name)) == 0) {
            // No post name yet - generate one from the title
            $postname = strtolower(sanitize_file_name($post->post_title));
            $assignments[] = "post_name = '" . mysql_real_escape_string($postname) . "'";
        }
        // Titles routinely contain apostrophes, so they must be escaped like the content.
        $assignments[] = "post_title = '" . mysql_real_escape_string($post->post_title) . "'";

        $query2 = "UPDATE " . POSTTABLE . " SET " . implode(', ', $assignments) . " WHERE ID = " . $postid;
        if (!mysql_query($query2)) {
            print "\n\n" . '** failed to update ' . $postid . ': ' . mysql_error() . "\n";
        }
    } else {
        // Delete posts with v little or no content, unless published posts or
        // pages still hang off them.
        $query3 = mysql_query("SELECT ID FROM " . POSTTABLE . " WHERE post_type IN ('post', 'page') AND post_status = 'publish' AND post_parent = " . $postid);
        if ($query3 === false) {
            // Without a reliable answer about children, do not delete anything.
            print "\n\n" . '** could not check children of ' . $postid . ': ' . mysql_error() . "\n";
        } else if (!mysql_fetch_object($query3)) {
            if (!mysql_query("DELETE FROM " . POSTTABLE . " WHERE ID = " . $postid)) {
                print "\n\n" . '** failed to delete ' . $postid . ': ' . mysql_error() . "\n";
            }
        }
    }
}
/**
 * Turn a title into a name safe to use as a post_name, unique among posts and pages.
 *
 * Based on the WordPress function of the same name. Strips special characters
 * and leftover percent-encoded bytes, collapses whitespace and dashes into a
 * single dash, trims leading/trailing '.', '-' and '_', then appends "-0",
 * "-1", ... until no post or page in POSTTABLE already uses the name.
 *
 * Relies on the open mysql_* connection and the POSTTABLE constant.
 *
 * @param string $filename Raw text to convert (the caller passes a post title).
 * @return string The sanitised, unique name. Case is preserved.
 */
function sanitize_file_name( $filename ) {
    $special_chars = array("?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "–", "—","—", chr(0));
    $filename = str_replace($special_chars, '', $filename);
    $filename = preg_replace('/[\s-]+/', '-', $filename);
    $filename = trim($filename, '.-_');

    // Each percent-encoded byte is a separate needle.
    $entities = array("%e2", "%80", "%9c", "%9d", "%94", "%a0", "%93", "%99");
    $filename = str_replace($entities, '', $filename);

    // Always suffix the sanitised base name so repeated collisions give
    // name-0, name-1, ... instead of name-0-1-2.
    $candidate = $filename;
    $i = 0;
    while (true) {
        $query = mysql_query("SELECT post_name FROM " . POSTTABLE . " WHERE post_type IN ('post', 'page') AND post_name = '" . mysql_real_escape_string($candidate) . "'");
        // A failed query is treated as "no clash" so the loop cannot spin forever.
        if (!$query || !mysql_fetch_object($query)) {
            break;
        }
        print('***not unique - ' . $candidate);
        $candidate = $filename . '-' . $i;
        $i = $i + 1;
    }
    return $candidate;
}
?> |