164 lines
6.4 KiB
PHP
164 lines
6.4 KiB
PHP
<?php

/**
 * Imports the Luther 1984 Bible text chapter by chapter.
 *
 * For every row of `bibel_chapter_1984_html` whose `verarbeitet` column is
 * still NULL the script downloads the page stored in `link`, cuts out the
 * text container, reduces the markup to one line per verse / heading and
 * stores each line in `bibel_lut_1984_html`. Afterwards the chapter row is
 * flagged with verarbeitet = 'Y' and the script sleeps 3-10 seconds before
 * requesting the next page.
 *
 * Chapters whose page cannot be downloaded, or whose markup does not contain
 * the expected start/end markers, are reported and left unprocessed so that
 * a later run picks them up again.
 *
 * Requires `config.inc.php`, which must provide dbconnect(). The handle is
 * used as a mysqli connection (query(), prepare(), fetch_array()).
 */

require_once("config.inc.php");

header("Content-Type: text/html; charset=utf-8");

/*
One-time schema setup (kept for reference, not executed by this script).
NOTE(review): the third ALTER names `bibel_chapter_1984`, while the script
below reads `bibel_chapter_1984_html` - confirm which name is correct.

create table bibel_chapter_1984_html
as
SELECT DISTINCT anz_buch, buch, kapitel, NULL as verarbeitet FROM `bibel_lut_1984`;

ALTER TABLE `bibel_lut_1984_html` ADD PRIMARY KEY(`bid`);

ALTER TABLE `bibel_lut_1984_html` CHANGE `bid` `bid` INT(11) NOT NULL AUTO_INCREMENT;

ALTER TABLE `bibel_chapter_1984` CHANGE `verarbeitet` `verarbeitet` VARCHAR(1) NULL DEFAULT NULL;

ALTER TABLE `bibel_lut_1984_html` CHANGE `anz_buch` `anz_buch` VARCHAR(2) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `buch` `buch` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `vers` `vers` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `bibelstelle` `bibelstelle` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `inhalt` `inhalt` TEXT CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL;
*/

$db = dbconnect();

// set_charset() (instead of a raw "SET NAMES") also informs the client
// library about the connection encoding.
$db->set_charset('utf8');

// All chapters that have not been imported yet.
$query = "SELECT cid, anz_buch, buch, kapitel, link
          FROM `bibel_chapter_1984_html`
          WHERE verarbeitet IS NULL
          ORDER BY cid ASC
         ";

// Debugging aid: restrict the run to a single chapter.
# $query = "SELECT cid, anz_buch, buch, kapitel, link
#           FROM `bibel_chapter_1984_html`
#           WHERE cid =1084
#          ";

$result = $db->query($query)
    or die ("Cannot execute query: result");

// Prepared once, executed many times. Bound parameters replace the former
// string-built SQL, so no manual escaping (addslashes) is needed and the
// stored values are exactly the decoded text.
$insertVerse = $db->prepare(
    "INSERT INTO bibel_lut_1984_html (anz_buch, buch, kapitel, vers, bibelstelle, inhalt)
     VALUES (?, ?, ?, ?, ?, ?)"
);
$markProcessed = $db->prepare(
    "UPDATE bibel_chapter_1984_html
     SET verarbeitet='Y'
     WHERE anz_buch = ?
     AND buch = ?
     AND kapitel = ?"
);
if (!$insertVerse || !$markProcessed) {
    die("Cannot prepare statements: " . $db->error);
}

// bind_param() binds by reference: assigning to these variables later is
// enough to change the values sent on the next execute().
$anzBuch = $buch = $kapitel = $vers = $bibelstelle = $inhaltNeu = '';
$insertVerse->bind_param('ssssss', $anzBuch, $buch, $kapitel, $vers, $bibelstelle, $inhaltNeu);
$markProcessed->bind_param('sss', $anzBuch, $buch, $kapitel);

// One transaction per chapter: either all verses plus the "processed" flag
// are stored or nothing is. NOTE(review): this only has an effect on a
// transactional storage engine; on other engines it is a harmless no-op.
$db->autocommit(false);

while ($row = $result->fetch_array()) {

    #$url= "https://www.die-bibel.de/bibeln/online-bibeln/lutherbibel-1984/bibeltext/bibel/text/lesen/stelle/58/10001/19999/";

    // Download the page and keep only the part between the text container
    // and the navigation that follows it. Any step may yield false.
    $page    = file_get_contents($row['link']);
    $section = false;
    if ($page !== false) {
        $section = strstr($page, '<div class="bible-text-container">');
        if ($section !== false) {
            $section = strstr($section, '<nav class="bible-navigation bible-navigation--simple">', true);
        }
    }

    if ($section === false) {
        // Do NOT flag the chapter: nothing was imported, so it must stay
        // selectable for the next run.
        $random = rand(3, 10);
        sleep($random);
        echo "SKIPPED $row[anz_buch] $row[buch] $row[kapitel]: page could not be fetched or has an unexpected layout\t ->\t $random <br>";
        continue;
    }

    // Remove all line breaks, then collapse every whitespace run to one
    // blank. (The former extra call passed PREG_SET_ORDER as the $limit
    // argument and was fully covered by this one.)
    $handle = str_replace(array("\r", "\n"), '', $section);
    $handle = preg_replace('#\s+#', ' ', $handle);

    // Start a new line in front of every verse marker.
    $handle = str_replace("<span class=\"verse\"", "\n<span class=\"verse\"", $handle);

    // Chapter heading: drop the scripture reference, turn the name into <h1>.
    $handle = preg_replace('#(<li class="scripture">).*?(</li>)#', '', $handle);
    $handle = preg_replace('#(<li class="name">)(.*?)(</li>)#', '<h1>$2</h1>', $handle);

    // Section headings become <h3> on a line of their own. The id may have
    // more than one digit (h10, h11, ...).
    $handle = preg_replace('#(<h1 id=\"h[0-9]+\">)(.*?)(</h1>)#', '<h3>$2</h3>', $handle);
    $handle = str_replace("</h3>", "</h3>\n", $handle);
    $handle = str_replace("<h3>", "\n<h3>", $handle);

    // Remove the chapter number.
    $handle = preg_replace('#(<span class="chapter">)(.*?)(</span>)#', '', $handle);

    // Unwrap verse spans and paragraphs; protect line-break divs with a
    // placeholder so that strip_tags() below does not remove them.
    $handle = preg_replace('#(<span class="verse".*?>)(.*?)(</span>)#', '$2', $handle);
    $handle = preg_replace('#(<p>)(.*?)(</p>)#', '$2', $handle);
    $handle = preg_replace('#(<div class="linebreak"></div>)#', '{LINEBREAK}', $handle);

    // Keep the special typography of the words HERR / HERRN.
    $handle = preg_replace('#(<span class="name-of-deity">)(.*?)(</span>)#', '{NAMEOFDEITY}$2{/NAMEOFDEITY}', $handle);

    // Strip every remaining tag except the whitelisted ones.
    $handle = strip_tags($handle, '<h3><h1><strong><em>');

    // Turn the placeholders back into markup.
    $handle = str_replace('{LINEBREAK}', '<div class="linebreak"></div>', $handle);
    $handle = str_replace('{/NAMEOFDEITY}', '</span>', $handle);
    $handle = str_replace('{NAMEOFDEITY}', '<span class="name-of-deity">', $handle);

    // Remove the blank after a closing div.
    $handle = str_replace("</div> ", "</div>", $handle);

    // Merge directly adjacent bold runs (e.g. Matthew 6).
    $handle = str_replace("</strong><strong>", " ", $handle);

    // Merging bold runs may have produced double blanks; collapse them.
    $handle = preg_replace('# {2,}#', ' ', $handle);

    // Collapse multiple consecutive line breaks.
    $handle = preg_replace('/(?:[ \t]*(?:\n|\r\n?)){2,}/', "\n", $handle);

    // In Sirach a verse number can be followed directly by a second,
    // bracketed number on the next line ("8&nbsp;" / "[7]"); that is not a
    // new verse, so both are joined onto one line.
    // NOTE(review): the "&nbsp;" separator here and in the explode() below
    // is assumed from the site markup - confirm against a live page.
    $handle = str_replace("&nbsp;\n[", "&nbsp;[", $handle);

    $data = explode("\n", $handle);

    $anzBuch = $row['anz_buch'];
    $buch    = $row['buch'];
    $kapitel = $row['kapitel'];

    // NOTE(review): the first and the last line are skipped on purpose as in
    // the original loop; confirm that the last line never holds a verse.
    $lastIndex = count($data) - 1;
    for ($i = 1; $i < $lastIndex; $i++) {
        // "<verse number>&nbsp;<text>"; limit 2 keeps any further &nbsp;
        // inside the text instead of cutting the verse off there.
        $inhalt = explode('&nbsp;', $data[$i], 2);
        $bibelstelle = "$row[anz_buch] $row[buch] $row[kapitel]";

        if (isset($inhalt[1])) {
            $vers = trim($inhalt[0]);
            $inhaltNeu = trim(html_entity_decode($inhalt[1], ENT_QUOTES, 'UTF-8'));
            $bibelstelle .= ", $vers";
        } else {
            // Line without a verse number (heading): stored with empty vers.
            $vers = '';
            $inhaltNeu = trim(html_entity_decode($inhalt[0], ENT_QUOTES, 'UTF-8'));
        }

        if (!$insertVerse->execute()) {
            $error = $insertVerse->error;
            $db->rollback();
            die("Cannot insert verse ($bibelstelle): " . $error);
        }
    }

    if (!$markProcessed->execute()) {
        $error = $markProcessed->error;
        $db->rollback();
        die("Cannot mark chapter as processed ($row[anz_buch] $row[buch] $row[kapitel]): " . $error);
    }
    $db->commit();

    // Pause for a random number of seconds before the next request.
    $random = rand(3, 10);
    sleep($random);

    echo "UPDATE bibel_chapter_1984_html SET verarbeitet='Y' WHERE anz_buch='$row[anz_buch]' AND buch = '$row[buch]' AND kapitel = '$row[kapitel]'\t ->\t $random <br>";
}

$insertVerse->close();
$markProcessed->close();
$result->free();