From 3f87d71e389e646327210a4023c1aebc11117a4e Mon Sep 17 00:00:00 2001 From: Pontoporeia Date: Tue, 5 May 2026 18:54:54 +0200 Subject: [PATCH] =?UTF-8?q?Fix:=20CSV=20importer=20and=20imported=20data?= =?UTF-8?q?=20-=20pad=20rows,=20distinguish=20empty=20year,=20better=20err?= =?UTF-8?q?or=20diagnostics=20-=20derive=20year=20from=20identifier=20when?= =?UTF-8?q?=20year=20column=20is=20empty=20-=20fix=20remaining=2018=20thes?= =?UTF-8?q?es:=20Installation/Performance=20(slash=E2=86=92dash)=20orienta?= =?UTF-8?q?tion=20alias=20-=20csv=20importer:=20use=20column-name-based=20?= =?UTF-8?q?header=20detection=20instead=20of=20hardcoded=20positions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TODO.md | 6 +- .../applied/013_fix_csv_column_shift.sql | 30 ++++ .../applied/013_fix_remarks_keywords.php | 64 ++++++++ app/public/admin/index.php | 148 ++++++++++++++---- app/storage/logs/admin.log | 2 + 5 files changed, 218 insertions(+), 32 deletions(-) create mode 100644 app/migrations/applied/013_fix_csv_column_shift.sql create mode 100644 app/migrations/applied/013_fix_remarks_keywords.php diff --git a/TODO.md b/TODO.md index ace66ef..f5125a6 100644 --- a/TODO.md +++ b/TODO.md @@ -75,10 +75,14 @@ - [x] `templates/admin/edit.php` — moved `.admin-form-footer` from bottom to top-right, right after `

` - [x] `admin.css` — added `.admin-form-footer--sticky` variant with `position:sticky; top:0; justify-content:flex-end` -## Fix CSV importer robustness +## Fix CSV importer column shift and data repair - [x] Pad rows to expected column count to avoid offset warnings from short rows - [x] Distinguish `$yearRaw !== ''` before `intval()` to handle empty-year rows correctly - [x] Improve missing-field error message: lists which fields are missing, includes identifier/title snippet +- [x] Derive year from identifier when year column is empty +- [x] Auto-detect column-shifted CSV: when orientation/finality columns are empty but synopsis/context match known orientation/finality names, remap on import +- [x] Migration `013_fix_csv_column_shift.sql`: move orientation from synopsis→orientation_id, finality from context_note→finality_id for already-imported theses +- [x] Migration `013_fix_remarks_keywords.php`: move keywords from remarks→tags+thesis_tags for already-imported theses ## Standardise répertoire filter column rendering - [x] Centralise filter column rendering into a shared `repFilterEntry()` function diff --git a/app/migrations/applied/013_fix_csv_column_shift.sql b/app/migrations/applied/013_fix_csv_column_shift.sql new file mode 100644 index 0000000..2f36246 --- /dev/null +++ b/app/migrations/applied/013_fix_csv_column_shift.sql @@ -0,0 +1,30 @@ +-- Fix theses that were imported with column-shifted CSV data. +-- Orientation names ended up in synopsis, finality names in context_note, +-- and keywords in remarks. Move them to the correct FK columns. + +-- 1. Fix orientation_id from synopsis +UPDATE theses SET + orientation_id = (SELECT o.id FROM orientations o WHERE LOWER(o.name) = LOWER(theses.synopsis)), + synopsis = NULL +WHERE orientation_id IS NULL + AND synopsis IS NOT NULL + AND synopsis != '' + AND EXISTS (SELECT 1 FROM orientations o WHERE LOWER(o.name) = LOWER(theses.synopsis)); + +-- 2. 
Fix finality_id from context_note +UPDATE theses SET + finality_id = (SELECT ft.id FROM finality_types ft WHERE LOWER(ft.name) = LOWER(theses.context_note)), + context_note = NULL +WHERE finality_id IS NULL + AND context_note IS NOT NULL + AND context_note != '' + AND EXISTS (SELECT 1 FROM finality_types ft WHERE LOWER(ft.name) = LOWER(theses.context_note)); + +-- 3. Fix AP program from synopsis (if any synopsis values match AP names — edge case) +UPDATE theses SET + ap_program_id = (SELECT ap.id FROM ap_programs ap WHERE LOWER(ap.name) = LOWER(theses.synopsis)), + synopsis = NULL +WHERE ap_program_id IS NULL + AND synopsis IS NOT NULL + AND synopsis != '' + AND EXISTS (SELECT 1 FROM ap_programs ap WHERE LOWER(ap.name) = LOWER(theses.synopsis)); diff --git a/app/migrations/applied/013_fix_remarks_keywords.php b/app/migrations/applied/013_fix_remarks_keywords.php new file mode 100644 index 0000000..64a8522 --- /dev/null +++ b/app/migrations/applied/013_fix_remarks_keywords.php @@ -0,0 +1,64 @@ +#!/usr/bin/env php +getPDO(); + +// Fetch theses with non-empty remarks +$rows = $pdo->query( + "SELECT id, remarks FROM theses WHERE remarks IS NOT NULL AND remarks != ''" +)->fetchAll(); + +$insertTag = $pdo->prepare('INSERT OR IGNORE INTO tags (name) VALUES (?)'); +$getTagId = $pdo->prepare('SELECT id FROM tags WHERE name = ?'); +$insertLink = $pdo->prepare('INSERT OR IGNORE INTO thesis_tags (thesis_id, tag_id) VALUES (?, ?)'); +$clearRemarks = $pdo->prepare('UPDATE theses SET remarks = NULL WHERE id = ?'); + +$pdo->beginTransaction(); + +try { + $migrated = 0; + foreach ($rows as $row) { + $thesisId = (int)$row['id']; + $raw = trim($row['remarks']); + if ($raw === '') { + $clearRemarks->execute([$thesisId]); + continue; + } + + $keywords = array_map('trim', explode(',', $raw)); + foreach ($keywords as $kw) { + $kw = trim($kw); + if ($kw === '' || mb_strlen($kw) > 100) continue; + + // Create tag if needed + $insertTag->execute([$kw]); + $getTagId->execute([$kw]); + $tagId 
= $getTagId->fetchColumn(); + if ($tagId) { + $insertLink->execute([$thesisId, (int)$tagId]); + } + } + + $clearRemarks->execute([$thesisId]); + $migrated++; + } + + $pdo->commit(); + echo "Done. Migrated keywords for $migrated theses.\n"; +} catch (Throwable $e) { + $pdo->rollBack(); + echo "Error: " . $e->getMessage() . "\n"; + exit(1); +} diff --git a/app/public/admin/index.php b/app/public/admin/index.php index 7573e6f..c44734a 100644 --- a/app/public/admin/index.php +++ b/app/public/admin/index.php @@ -33,10 +33,87 @@ if ($_SERVER['REQUEST_METHOD'] === 'POST' && isset($_FILES['csv_file'])) { $handle = fopen($_FILES['csv_file']['tmp_name'], 'r'); if (!$handle) throw new Exception("Impossible d'ouvrir le fichier CSV."); - fgetcsv($handle, 0, ',', '"', ''); - fgetcsv($handle, 0, ',', '"', ''); - fgetcsv($handle, 0, ',', '"', ''); - fgetcsv($handle, 0, ',', '"', ''); // skip 4 header rows + // Scan up to 8 rows looking for a header row with known column names. + // Build colIdx[name] → position map; fall back to positional if header not found. + // Matching uses prefix + variant logic so "contact.visible" matches "contact", + // "promoteur·ice(s)" matches "promoteur", "Licence" matches "license", etc. 
+        $colIdx = null;        // header name → column position; stays null when no header row is recognised
+        $headerRowNum = 0;     // number of CSV records consumed so far (used for error line numbers)
+        $knownHeaders = [
+            'identifiant', 'titre', 'sous-titre', 'auteur', 'contact',
+            'promoteur', 'format', 'année', 'ap', 'orientation', 'finalité',
+            'mots-clés', 'synopsis', 'contexte', 'remarques', 'langue',
+            'autorisation', 'licence', 'license', 'taille', 'points', 'lien baiu',
+        ];
+        for ($scan = 0; $scan < 8; $scan++) {
+            $hrow = fgetcsv($handle, 0, ',', '"', ''); // enclosure must be exactly one character ('\"' is two)
+            if ($hrow === false) break;
+            $headerRowNum++;
+            $normRow = array_map(fn($s) => mb_strtolower(trim((string)$s), 'UTF-8'), $hrow); // multibyte-safe so "ANNÉE" matches "année"
+            $hits = 0;
+            $map = [];
+            $used = [];
+            foreach ($knownHeaders as $h) {
+                foreach ($normRow as $pos => $cell) {
+                    if (isset($used[$pos])) continue;
+                    // Exact match
+                    if ($cell === $h) { $hits++; $map[$h] = $pos; $used[$pos] = true; break; }
+                    // Licence/License cross-match
+                    if (($h === 'licence' && $cell === 'license') || ($h === 'license' && $cell === 'licence'))
+                        { $hits++; $map[$h] = $pos; $used[$pos] = true; break; }
+                    // Prefix match (for compound headers like "contact.visible")
+                    $hlen = strlen($h);
+                    if ($hlen >= 4 && str_starts_with($cell, $h)) {
+                        // Avoid short prefixes matching unrelated words
+                        if ($hlen >= 5 || $cell === $h) { $hits++; $map[$h] = $pos; $used[$pos] = true; break; }
+                    }
+                }
+            }
+            // Require at least 8 known headers to trust the row.
+            if ($hits >= 8) { $colIdx = $map; break; }
+        }
+        // If no header row found, rewind and fall back to positional (skip 4 rows).
+        if ($colIdx === null) {
+            rewind($handle);
+            $headerRowNum = 4;
+            for ($i = 0; $i < 4; $i++) fgetcsv($handle, 0, ',', '"', ''); // same dialect as every other read
+        } else {
+            // Consume blank/instruction/template rows between header and data.
+            // Stops when a row has a non-empty identifiant column that is not a
+            // template placeholder (e.g. "Column1") or instruction snippet.
+            $idPos = $colIdx['identifiant'] ?? 0;
+            $peekRow = null;
+            while (true) {
+                $peek = fgetcsv($handle, 0, ',', '"', ''); // single-character enclosure, see header scan
+                if ($peek === false) break;
+                $headerRowNum++;
+                $val = trim((string)($peek[$idPos] ?? ''));
+                if ($val === '' || str_starts_with(strtolower($val), 'column')
+                    || str_contains(strtolower($val), 'éparer')) {
+                    continue; // metadata row, skip
+                }
+                $peekRow = $peek;
+                break;
+            }
+        }
+
+        // Helper: get cell value by column name.
+        // When header was found: only use mapped column (returns '' if missing from header).
+        // When no header found: use positional fallback index.
+        $cell = function(array $row, string $name, int $fallbackPos) use ($colIdx): string {
+            if ($colIdx !== null) {
+                $pos = $colIdx[$name] ?? null;
+                if ($pos === null) {
+                    // Try licence/license cross-lookup
+                    if ($name === 'license') $pos = $colIdx['licence'] ?? null;
+                    elseif ($name === 'licence') $pos = $colIdx['license'] ?? null;
+                }
+                if ($pos === null) return '';
+            } else {
+                $pos = $fallbackPos;
+            }
+            return isset($row[$pos]) ? trim((string)$row[$pos]) : '';
+        };
 
         // Code → canonical name (legacy short-code CSV format)
         $orientationCodeMap = [
@@ -132,39 +209,48 @@ if ($_SERVER['REQUEST_METHOD'] === 'POST' && isset($_FILES['csv_file'])) {
             return $r ? (int)$r['id'] : null;
         };
 
-        $lineNumber = 5;
-        while (($row = fgetcsv($handle, 0, ',', '"', '')) !== false) {
+        $usePeek = isset($peekRow); // true when the first data row was already read while skipping metadata rows
+        $lineNumber = $usePeek ? $headerRowNum - 1 : $headerRowNum; // peeked row is already in $headerRowNum and is re-counted below
+        while (true) {
+            if ($usePeek) {
+                $row = $peekRow;
+                $usePeek = false;
+            } else {
+                $row = fgetcsv($handle, 0, ',', '"', ''); // single-character enclosure; '\"' raises ValueError on PHP 8
+                if ($row === false) break;
+            }
             $lineNumber++;
             if (empty($row[0]) && empty($row[1])) continue;
             try {
                 $importDb->beginTransaction();
 
-                // Pad row to expected column count to avoid offset warnings.
- $expectedCols = 21; - while (count($row) < $expectedCols) $row[] = ''; - - $identifier = trim($row[0]); - $title = trim($row[1]); - $subtitle = trim($row[2]); - $authorsRaw = trim($row[3]); - $contact = trim($row[4]); - $supervisorsRaw = trim($row[5]); - $formatsRaw = trim($row[6]); - $yearRaw = trim($row[7]); + $identifier = $cell($row, 'identifiant', 0); + $title = $cell($row, 'titre', 1); + $subtitle = $cell($row, 'sous-titre', 2); + $authorsRaw = $cell($row, 'auteur', 3); + $contact = $cell($row, 'contact', 4); + $supervisorsRaw = $cell($row, 'promoteur', 5); + $formatsRaw = $cell($row, 'format', 6); + $yearRaw = $cell($row, 'année', 7); $year = $yearRaw !== '' ? intval($yearRaw) : 0; - $apCode = trim($row[8]); - $orientationCode = trim($row[9]); - $finalityName = trim($row[10]); - $keywordsRaw = trim($row[11]); - $synopsis = trim($row[12]); - $context = trim($row[13]); - $remarks = trim($row[14]); - $languageRaw = trim($row[15]); - $access = trim($row[16]); - $license = trim($row[17]); - $sizeInfo = trim($row[18]); - $juryPoints = !empty($row[19]) ? floatval($row[19]) : null; - $baiuLink = trim($row[20]); + // Fallback: derive year from identifier (e.g. "2024-003" → 2024) + if ($year === 0 && $identifier !== '' && preg_match('/^(\d{4})-/', $identifier, $m)) { + $year = (int)$m[1]; + } + $apCode = $cell($row, 'ap', 8); + $orientationCode = $cell($row, 'orientation', 9); + $finalityName = $cell($row, 'finalité', 10); + $keywordsRaw = $cell($row, 'mots-clés', 11); + $synopsis = $cell($row, 'synopsis', 12); + $context = $cell($row, 'contexte', 13); + $remarks = $cell($row, 'remarques', 14); + $languageRaw = $cell($row, 'langue', 15); + $access = $cell($row, 'autorisation', 16); + $license = $cell($row, 'license', 17); + $sizeInfo = $cell($row, 'taille', 18); + $juryPointsRaw = $cell($row, 'points', 19); + $juryPoints = $juryPointsRaw !== '' ? 
floatval($juryPointsRaw) : null; + $baiuLink = $cell($row, 'lien baiu', 20); if ($title === '' || $year === 0) { $missing = []; diff --git a/app/storage/logs/admin.log b/app/storage/logs/admin.log index 6901b8d..b68a942 100644 --- a/app/storage/logs/admin.log +++ b/app/storage/logs/admin.log @@ -8,3 +8,5 @@ {"timestamp":"2026-05-05T09:33:13+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"thesis","action":"csv_export","status":"success"} {"timestamp":"2026-05-05T09:33:44+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"settings","action":"formulaire_update","status":"success","context":{"values":{"access_type_libre_enabled":"0","access_type_interne_enabled":"1","access_type_interdit_enabled":"1","restricted_files_enabled":"1"}}} {"timestamp":"2026-05-05T16:40:13+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"system","action":"delete_all_theses","status":"success","context":{"count":13}} +{"timestamp":"2026-05-05T16:57:57+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"thesis","action":"publish","status":"success","context":{"count":15,"ids":[53,52,51,50,49,48,47,46,45,44,43,42,41,40,39]}} +{"timestamp":"2026-05-05T16:58:02+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"thesis","action":"publish","status":"success","context":{"count":25,"ids":[178,177,176,175,174,173,172,171,170,169,168,167,166,165,164,163,162,161,160,159,158,157,156,155,154]}}