0

I have a database that contains more than 640,000 records that I update every week with data from a JSON file. What I want to do is only load records into the database that do not currently exists. My script below works on small amounts of data but when I try to load a large file it times out (I get a 500 Internal Server Error). Is there a better way to do this?

<?php
set_time_limit(0);
ini_set('memory_limit','2000M');

$url = 'json/OERrecordstest.json';
$contents = file_get_contents($url);
$records = json_decode($contents, true);
include("../config.php"); 

echo "<div class='card card-body'>";
foreach($records as $record) {  

    $type = $record['type'];
    $name = $record['title'];
    $title = addslashes($name);
    $creator = $record['author'];
    $author = addslashes($creator);
    $link = addslashes($record['link']);
    $origin = $record['source'];
    $source = addslashes($origin);
    $description = addslashes($record['description']);
    $base_url = $record['base_url'];
    $isbn_number = $record['isbn_number'];
    $e_isbn_number = $record['e_isbn_number'];
    $publication_date = $record['publication_date'];
    $license = $record['license'];
    $subject = addslashes($record['subject']);
    $image_url = $record['image_url'];
    $review = $record['review'];
    $language = $record['language'];
    $license_url = $record['license_url'];
    $publisher = addslashes($record['publisher']);
    $publisher_url = $record['publisher_url'];

    $query = $conn->prepare("SELECT * FROM oer_search WHERE title=:title AND author=:author AND source=:source");
    $query->bindParam(":title", $name);
    $query->bindParam(":author", $creator);
    $query->bindParam(":source", $origin);
    $query->execute();

    if ($query->rowCount() == 0) {
        $insert = $conn->prepare("INSERT INTO oer_search (type, title, author, link, source, description, base_url, isbn_number, e_isbn_number, publication_date, license, subject, image_url, review, language, license_url, publisher, publisher_url) VALUES ('$type', '$title', '$author', '$link', '$source', '$description', '$base_url', '$isbn_number', '$e_isbn_number', '$publication_date', '$license', '$subject', '$image_url', '$review', '$language', '$license_url', '$publisher', '$publisher_url')");
        $insert->execute();
    }

}

if($insert){
    echo "<p><span class='recordInserted'><em>$name was successfully inserted into SOAR.</em></span></p>";
}
else {
    echo "<p><span class='recordInserted'><em>Record(s) already exist in SOAR.</em></span></p>";
}
echo "</div>";
?>
brawlins4
  • 302
  • 6
  • 20

2 Answers2

-1

I could not comment, I wrote as an answer because my score was not enough. can you change it like this and try it?

 $query = $conn->prepare("SELECT id FROM oer_search WHERE title=:title AND author=:author AND source=:source limit 1");

or

<?php
if(!session_id()) session_start();
ini_set('memory_limit', '2000M');

$url = 'json/OERrecordstest.json';
$contents = file_get_contents($url);
$records = json_decode($contents, true);
include("../config.php");

echo "<div class='card card-body'>";

if (!$_SESSION["records"]) {
    foreach ($records as $record) {
        $_SESSION["records"][$record["id"]] = $records;
    }
}
$i = 0;
foreach ($_SESSION["records"] as $record) {
    $i++;
    if ($i > 1000) break;

    $type = $record['type'];
    $name = $record['title'];
    $title = addslashes($name);
    $creator = $record['author'];
    $author = addslashes($creator);
    $link = addslashes($record['link']);
    $origin = $record['source'];
    $source = addslashes($origin);
    $description = addslashes($record['description']);
    $base_url = $record['base_url'];
    $isbn_number = $record['isbn_number'];
    $e_isbn_number = $record['e_isbn_number'];
    $publication_date = $record['publication_date'];
    $license = $record['license'];
    $subject = addslashes($record['subject']);
    $image_url = $record['image_url'];
    $review = $record['review'];
    $language = $record['language'];
    $license_url = $record['license_url'];
    $publisher = addslashes($record['publisher']);
    $publisher_url = $record['publisher_url'];

    $query = $conn->prepare("SELECT id FROM oer_search WHERE title=:title AND author=:author AND source=:source limit 1");
    $query->bindParam(":title", $name);
    $query->bindParam(":author", $creator);
    $query->bindParam(":source", $origin);
    $query->execute();

    if ($query->rowCount() == 0) {
        $insert = $conn->prepare("INSERT INTO oer_search (type, title, author, link, source, description, base_url, isbn_number, e_isbn_number, publication_date, license, subject, image_url, review, language, license_url, publisher, publisher_url) VALUES ('$type', '$title', '$author', '$link', '$source', '$description', '$base_url', '$isbn_number', '$e_isbn_number', '$publication_date', '$license', '$subject', '$image_url', '$review', '$language', '$license_url', '$publisher', '$publisher_url')");
        $insert->execute();

        unset($_SESSION["records"][$record["id"]]);
    }

}

print "remaining data :". count($_SESSION["records"]);
?>
-1

Tipps to speed up mass-imports:

  • Move your SQL prepare outside of the loop (you only have to do it once)
  • Collect data to insert into batches of 1000 (for example.. usually alot more possible)
  • Use transactions / disable Index calculation during insert
  • Find duplicates with a lookup array from existing data (don't query the database for each row of your import)
  • In general: Avoid SQL queries in Loops

hope that helps a bit

Lars Stegelitz
  • 1,592
  • 9
  • 17