Leitgedanken/fetch_bibel/#func_get_lut1984_diebibel.php
2022-11-21 09:47:28 +01:00

164 lines
6.4 KiB
PHP

<?php
require_once("config.inc.php");
header("Content-Type: text/html; charset=utf-8");
/*
Schema notes kept from the original author (SQL used while setting up the tables):

create table bibel_chapter_1984_html
as
SELECT DISTINCT anz_buch, buch, kapitel, NULL as verarbeitet FROM `bibel_lut_1984`;
ALTER TABLE `bibel_lut_1984_html` ADD PRIMARY KEY(`bid`);
ALTER TABLE `bibel_lut_1984_html` CHANGE `bid` `bid` INT(11) NOT NULL AUTO_INCREMENT;
ALTER TABLE `bibel_chapter_1984` CHANGE `verarbeitet` `verarbeitet` VARCHAR(1) NULL DEFAULT NULL;
ALTER TABLE `bibel_lut_1984_html` CHANGE `anz_buch` `anz_buch` VARCHAR(2) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `buch` `buch` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `vers` `vers` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `bibelstelle` `bibelstelle` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, CHANGE `inhalt` `inhalt` TEXT CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL;

NOTE(review): these statements mix the table names `bibel_chapter_1984`,
`bibel_chapter_1984_html` and `bibel_lut_1984_html`, and none of them creates
the `cid` and `link` columns selected below - verify against the live schema.
*/
$db = dbconnect();
# Set the connection charset through the driver API instead of a raw
# "SET NAMES" query, so the client library is aware of the charset as well.
# NOTE(review): assumes dbconnect() returns a mysqli connection - confirm in config.inc.php.
$db->set_charset('utf8')
or die ("Cannot set connection charset");
# All chapters that are not yet marked as processed, in ascending cid order.
$query = "SELECT cid, anz_buch, buch, kapitel, link
FROM `bibel_chapter_1984_html`
WHERE verarbeitet IS NULL
ORDER BY cid ASC
";
# Debugging aid: uncomment to restrict the run to a single chapter.
# $query = "SELECT cid, anz_buch, buch, kapitel, link
# FROM `bibel_chapter_1984_html`
# WHERE cid =1084
# ";
$result = $db->query( $query)
or die ("Cannot execute query: result");
/**
 * Reduce the raw HTML of one chapter page to a list of text lines.
 *
 * The text between the bible text container and the simple navigation bar is
 * cut out, flattened to a single line and then re-broken so that every verse
 * span and every sub-heading starts on its own line. All tags except
 * <h1>, <h3>, <strong>, <em>, the line-break <div> and the name-of-deity
 * <span> are removed.
 *
 * @param string $html Complete HTML document of the chapter page.
 * @return array|false Lines of the cleaned text, or false when the expected
 *                     start/end markers are not present in $html.
 */
function lut1984_html_to_lines($html)
{
    # Keep only the part from the text container up to (excluding) the navigation.
    $text = strstr($html, '<div class="bible-text-container">');
    if ($text === false) {
        return false;
    }
    $text = strstr($text, '<nav class="bible-navigation bible-navigation--simple">', true);
    if ($text === false) {
        return false;
    }

    # Remove all line breaks, then collapse every whitespace run to one space.
    $text = str_replace(array("\r", "\n"), '', $text);
    $text = preg_replace('#\s+#', ' ', $text);

    # Start a new line in front of every verse span.
    $text = str_replace("<span class=\"verse\"", "\n<span class=\"verse\"", $text);

    # Chapter title: drop the scripture entry, turn the name entry into <h1>.
    $text = preg_replace('#(<li class="scripture">).*?(</li>)#', '', $text);
    $text = preg_replace('#(<li class="name">)(.*?)(</li>)#', '<h1>$2</h1>', $text);

    # Sub-headings (<h1 id="hN">) become <h3> on a line of their own.
    # Multi-digit ids are accepted as well.
    $text = preg_replace('#(<h1 id=\"h[0-9]+\">)(.*?)(</h1>)#', '<h3>$2</h3>', $text);
    $text = str_replace("</h3>", "</h3>\n", $text);
    $text = str_replace("<h3>", "\n<h3>", $text);

    # Remove the chapter number span entirely.
    $text = preg_replace('#(<span class="chapter">)(.*?)(</span>)#', '', $text);

    # Unwrap verse spans and paragraphs, keeping their content.
    $text = preg_replace('#(<span class="verse".*?>)(.*?)(</span>)#', '$2', $text);
    $text = preg_replace('#(<p>)(.*?)(</p>)#', '$2', $text);

    # Protect the markup that must survive strip_tags() with placeholders:
    # explicit line breaks and the special typography for the name of God.
    $text = preg_replace('#(<div class="linebreak"></div>)#', '{LINEBREAK}', $text);
    $text = preg_replace('#(<span class="name-of-deity">)(.*?)(</span>)#', '{NAMEOFDEITY}$2{/NAMEOFDEITY}', $text);

    $text = strip_tags($text, '<h3><h1><strong><em>');

    # Restore the protected markup.
    $text = str_replace('{LINEBREAK}', '<div class="linebreak"></div>', $text);
    $text = str_replace('{/NAMEOFDEITY}', '</span>', $text);
    $text = str_replace('{NAMEOFDEITY}', '<span class="name-of-deity">', $text);

    # No space directly after a closing div.
    $text = str_replace("</div> ", "</div>", $text);
    # Merge directly adjacent bold runs (e.g. Matthew 6).
    $text = str_replace("</strong><strong>", " ", $text);
    # Collapse runs of spaces that the replacements above may have produced.
    $text = preg_replace('/ {2,}/', ' ', $text);
    # Collapse consecutive (possibly whitespace-only) line breaks into one.
    $text = preg_replace('/(?:[ \t]*(?:\n|\r\n?)){2,}/', "\n", $text);
    # Sirach contains sequences like "8&nbsp;\n[7]": that is not a new verse,
    # the bracketed number belongs to the preceding verse, so join the lines.
    $text = preg_replace('#(\&nbsp\;)(\\n)(\[)#', '$1$3', $text);

    return explode("\n", $text);
}

# Prepared statements replace the string-built SQL so that no value needs
# manual escaping. The bound variables are filled inside the loops below.
# NOTE(review): prepare()/bind_param() assume $db is a mysqli connection - confirm.
$insertVerse = $db->prepare("INSERT INTO bibel_lut_1984_html (anz_buch, buch, kapitel, vers, bibelstelle, inhalt)
VALUES (?, ?, ?, ?, ?, ?)")
or die ("Cannot prepare statement: insert");
$insertVerse->bind_param('ssssss', $anzBuch, $buch, $kapitel, $vers, $bibelstelle, $inhalt_neu);

$markChapter = $db->prepare("UPDATE bibel_chapter_1984_html
SET verarbeitet='Y'
WHERE anz_buch = ?
AND buch = ?
AND kapitel = ?")
or die ("Cannot prepare statement: update");
$markChapter->bind_param('sss', $anzBuch, $buch, $kapitel);

while ($row = $result->fetch_array()) {
    $anzBuch = $row['anz_buch'];
    $buch    = $row['buch'];
    $kapitel = $row['kapitel'];

    # Example link:
    # https://www.die-bibel.de/bibeln/online-bibeln/lutherbibel-1984/bibeltext/bibel/text/lesen/stelle/58/10001/19999/
    $html  = file_get_contents($row['link']);
    $lines = ($html === false) ? false : lut1984_html_to_lines($html);

    if ($lines === false) {
        # Download failed or the page layout changed: leave the chapter
        # unprocessed so that a later run picks it up again.
        $status = "SKIPPED (download failed or bible text not found): "
            . htmlspecialchars($row['link'], ENT_QUOTES);
    } else {
        # The first and the last line are not stored.
        # NOTE(review): presumably these are non-verse fragments (chapter title /
        # trailing remainder) - confirm that the last verse is not dropped here.
        $lastIndex = count($lines) - 1;
        for ($i = 1; $i < $lastIndex; $i++) {
            # A verse line looks like "<number>&nbsp;<text>"; lines without
            # "&nbsp;" (e.g. headings) are stored with an empty verse number.
            # NOTE(review): text after a second "&nbsp;" on the same line is discarded.
            $inhalt = explode('&nbsp;', $lines[$i]);
            $bibelstelle = "$anzBuch $buch $kapitel";
            if (isset($inhalt[1])) {
                $vers = trim($inhalt[0]);
                $inhalt_neu = trim(html_entity_decode($inhalt[1], ENT_QUOTES));
                $bibelstelle .= ", $vers";
            } else {
                $vers = '';
                $inhalt_neu = trim(html_entity_decode($inhalt[0], ENT_QUOTES));
            }
            if (!$insertVerse->execute()) {
                # Report the failure instead of ignoring it; processing continues.
                echo "INSERT failed for " . htmlspecialchars($bibelstelle, ENT_QUOTES)
                    . ": " . htmlspecialchars($insertVerse->error, ENT_QUOTES) . " <br>";
            }
        }

        if (!$markChapter->execute()) {
            echo "UPDATE failed: " . htmlspecialchars($markChapter->error, ENT_QUOTES) . " <br>";
        }
        $status = "UPDATE bibel_chapter_1984_html SET verarbeitet='Y' WHERE anz_buch='$anzBuch' AND buch = '$buch' AND kapitel = '$kapitel'";
    }

    # Pause for a random number of seconds between two page requests.
    $random = rand(3, 10);
    sleep($random);
    echo "$status\t ->\t $random <br>";
}
?>