195 lines
5.9 KiB
C
195 lines
5.9 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <libpq-fe.h>
|
|
#include <libxml/HTMLparser.h>
|
|
#include <libxml/xpath.h>
|
|
|
|
// Define the alumno struct
|
|
struct alumno {
|
|
char apellido_paterno[100];
|
|
char apellido_materno[100];
|
|
char curp[20];
|
|
char clave_carrera[2];
|
|
char plan[2];
|
|
char clave[6];
|
|
char nombre[100];
|
|
char correo[100];
|
|
char estatus;
|
|
char telefono[11];
|
|
int semestre;
|
|
char sexo;
|
|
};
|
|
|
|
// Function to check for PostgreSQL connection errors
|
|
void check_conn_status(PGconn *conn) {
|
|
if (PQstatus(conn) != CONNECTION_OK) {
|
|
fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
|
|
PQfinish(conn);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
|
|
// Function to check for PostgreSQL query execution errors
|
|
void check_exec_status(PGresult *res, PGconn *conn) {
|
|
if (PQresultStatus(res) != PGRES_TUPLES_OK) {
|
|
fprintf(stderr, "Query failed: %s", PQerrorMessage(conn));
|
|
PQclear(res);
|
|
PQfinish(conn);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
|
|
// Function to extract content from an HTML element by ID
|
|
char* get_element_content_by_id(htmlDocPtr doc, const char *id) {
|
|
xmlChar xpath[100];
|
|
snprintf((char *)xpath, sizeof(xpath), "//*[@id='%s']", id);
|
|
|
|
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
|
xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(xpath, xpathCtx);
|
|
|
|
if (xpathObj == NULL || xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
|
|
xmlXPathFreeObject(xpathObj);
|
|
xmlXPathFreeContext(xpathCtx);
|
|
return NULL;
|
|
}
|
|
|
|
xmlNodePtr node = xpathObj->nodesetval->nodeTab[0];
|
|
xmlChar *content = xmlNodeGetContent(node);
|
|
|
|
xmlXPathFreeObject(xpathObj);
|
|
xmlXPathFreeContext(xpathCtx);
|
|
|
|
return (char *)content;
|
|
}
|
|
|
|
// Function to parse HTML content using libxml2 and populate the alumno struct
|
|
void parse_html(const char *html, struct alumno *alum) {
|
|
htmlDocPtr doc = htmlReadMemory(html, strlen(html), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
|
if (doc == NULL) {
|
|
fprintf(stderr, "Failed to parse HTML\n");
|
|
return;
|
|
}
|
|
|
|
char *content;
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApPatAlumnoHP");
|
|
if (content) {
|
|
strncpy(alum->apellido_paterno, content, 100);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApMatAlumnoHP");
|
|
if (content) {
|
|
strncpy(alum->apellido_materno, content, 100);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCURPAlumnoHP");
|
|
if (content) {
|
|
strncpy(alum->curp, content, 20);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveCarrera");
|
|
if (content) {
|
|
strncpy(alum->clave_carrera, content, 2);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblAlupla");
|
|
if (content) {
|
|
strncpy(alum->plan, content, 4);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveUlsa");
|
|
if (content) {
|
|
strncpy(alum->clave, content, 7);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblNombreAlumnoHP");
|
|
if (content) {
|
|
strncpy(alum->nombre, content, 100);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCorreoAlumnoHP");
|
|
if (content) {
|
|
strncpy(alum->correo, content, 100);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblStat");
|
|
if (content) {
|
|
alum->estatus = content[0];
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblTelefonoAlumnoHP");
|
|
if (content) {
|
|
strncpy(alum->telefono, content, 11);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblSem");
|
|
if (content) {
|
|
alum->semestre = atoi(content);
|
|
xmlFree(content);
|
|
}
|
|
|
|
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblSexoAlumnoHP");
|
|
if (content) {
|
|
alum->sexo = content[0];
|
|
xmlFree(content);
|
|
}
|
|
|
|
xmlFreeDoc(doc);
|
|
}
|
|
|
|
int main() {
|
|
// PostgreSQL connection parameters
|
|
const char *conninfo = "dbname=sgi user=postgres password=h3rcul3s#$ hostaddr=200.13.89.8 port=5432";
|
|
PGconn *conn = PQconnectdb(conninfo);
|
|
|
|
// Check connection status
|
|
check_conn_status(conn);
|
|
|
|
// Execute SQL query to retrieve HTML content
|
|
PGresult *res = PQexec(conn, "SELECT datos_html FROM public.alumno_extraccion WHERE error_message IS NULL");
|
|
check_exec_status(res, conn);
|
|
|
|
// Process each row
|
|
int rows = PQntuples(res);
|
|
for (int i = 0; i < rows; i++) {
|
|
char *html_content = PQgetvalue(res, i, 0);
|
|
// printf("HTML Content: %s\n", html_content);
|
|
|
|
struct alumno alum;
|
|
memset(&alum, 0, sizeof(alum)); // Initialize the struct to zero
|
|
|
|
parse_html(html_content, &alum);
|
|
|
|
printf("Apellido Paterno: %s\n", alum.apellido_paterno);
|
|
printf("Apellido Materno: %s\n", alum.apellido_materno);
|
|
printf("CURP: %s\n", alum.curp);
|
|
printf("Clave Carrera: %s\n", alum.clave_carrera);
|
|
printf("Plan: %s\n", alum.plan);
|
|
printf("Clave: %s\n", alum.clave);
|
|
printf("Nombre: %s\n", alum.nombre);
|
|
printf("Correo: %s\n", alum.correo);
|
|
printf("Estatus: %c\n", alum.estatus);
|
|
printf("Telefono: %s\n", alum.telefono);
|
|
printf("Semestre: %d\n", alum.semestre);
|
|
printf("Sexo: %c\n", alum.sexo);
|
|
}
|
|
|
|
// Clean up
|
|
PQclear(res);
|
|
PQfinish(conn);
|
|
|
|
return 0;
|
|
}
|
|
|