Commit inicial
This commit is contained in:
194
extraccion_html.c
Normal file
194
extraccion_html.c
Normal file
@@ -0,0 +1,194 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <libpq-fe.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <libxml/xpath.h>
|
||||
|
||||
// Define the alumno struct
|
||||
struct alumno {
|
||||
char apellido_paterno[100];
|
||||
char apellido_materno[100];
|
||||
char curp[20];
|
||||
char clave_carrera[2];
|
||||
char plan[2];
|
||||
char clave[6];
|
||||
char nombre[100];
|
||||
char correo[100];
|
||||
char estatus;
|
||||
char telefono[11];
|
||||
int semestre;
|
||||
char sexo;
|
||||
};
|
||||
|
||||
// Function to check for PostgreSQL connection errors
|
||||
void check_conn_status(PGconn *conn) {
|
||||
if (PQstatus(conn) != CONNECTION_OK) {
|
||||
fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
|
||||
PQfinish(conn);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
// Function to check for PostgreSQL query execution errors
|
||||
void check_exec_status(PGresult *res, PGconn *conn) {
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK) {
|
||||
fprintf(stderr, "Query failed: %s", PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
// Function to extract content from an HTML element by ID
|
||||
char* get_element_content_by_id(htmlDocPtr doc, const char *id) {
|
||||
xmlChar xpath[100];
|
||||
snprintf((char *)xpath, sizeof(xpath), "//*[@id='%s']", id);
|
||||
|
||||
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
||||
xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(xpath, xpathCtx);
|
||||
|
||||
if (xpathObj == NULL || xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
|
||||
xmlXPathFreeObject(xpathObj);
|
||||
xmlXPathFreeContext(xpathCtx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
xmlNodePtr node = xpathObj->nodesetval->nodeTab[0];
|
||||
xmlChar *content = xmlNodeGetContent(node);
|
||||
|
||||
xmlXPathFreeObject(xpathObj);
|
||||
xmlXPathFreeContext(xpathCtx);
|
||||
|
||||
return (char *)content;
|
||||
}
|
||||
|
||||
// Function to parse HTML content using libxml2 and populate the alumno struct
|
||||
void parse_html(const char *html, struct alumno *alum) {
|
||||
htmlDocPtr doc = htmlReadMemory(html, strlen(html), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (doc == NULL) {
|
||||
fprintf(stderr, "Failed to parse HTML\n");
|
||||
return;
|
||||
}
|
||||
|
||||
char *content;
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApPatAlumnoHP");
|
||||
if (content) {
|
||||
strncpy(alum->apellido_paterno, content, 100);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApMatAlumnoHP");
|
||||
if (content) {
|
||||
strncpy(alum->apellido_materno, content, 100);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCURPAlumnoHP");
|
||||
if (content) {
|
||||
strncpy(alum->curp, content, 20);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveCarrera");
|
||||
if (content) {
|
||||
strncpy(alum->clave_carrera, content, 2);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblAlupla");
|
||||
if (content) {
|
||||
strncpy(alum->plan, content, 4);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveUlsa");
|
||||
if (content) {
|
||||
strncpy(alum->clave, content, 7);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblNombreAlumnoHP");
|
||||
if (content) {
|
||||
strncpy(alum->nombre, content, 100);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCorreoAlumnoHP");
|
||||
if (content) {
|
||||
strncpy(alum->correo, content, 100);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblStat");
|
||||
if (content) {
|
||||
alum->estatus = content[0];
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblTelefonoAlumnoHP");
|
||||
if (content) {
|
||||
strncpy(alum->telefono, content, 11);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblSem");
|
||||
if (content) {
|
||||
alum->semestre = atoi(content);
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblSexoAlumnoHP");
|
||||
if (content) {
|
||||
alum->sexo = content[0];
|
||||
xmlFree(content);
|
||||
}
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
}
|
||||
|
||||
int main() {
|
||||
// PostgreSQL connection parameters
|
||||
const char *conninfo = "dbname=sgi user=postgres password=h3rcul3s#$ hostaddr=200.13.89.8 port=5432";
|
||||
PGconn *conn = PQconnectdb(conninfo);
|
||||
|
||||
// Check connection status
|
||||
check_conn_status(conn);
|
||||
|
||||
// Execute SQL query to retrieve HTML content
|
||||
PGresult *res = PQexec(conn, "SELECT datos_html FROM public.alumno_extraccion WHERE error_message IS NULL");
|
||||
check_exec_status(res, conn);
|
||||
|
||||
// Process each row
|
||||
int rows = PQntuples(res);
|
||||
for (int i = 0; i < rows; i++) {
|
||||
char *html_content = PQgetvalue(res, i, 0);
|
||||
// printf("HTML Content: %s\n", html_content);
|
||||
|
||||
struct alumno alum;
|
||||
memset(&alum, 0, sizeof(alum)); // Initialize the struct to zero
|
||||
|
||||
parse_html(html_content, &alum);
|
||||
|
||||
printf("Apellido Paterno: %s\n", alum.apellido_paterno);
|
||||
printf("Apellido Materno: %s\n", alum.apellido_materno);
|
||||
printf("CURP: %s\n", alum.curp);
|
||||
printf("Clave Carrera: %s\n", alum.clave_carrera);
|
||||
printf("Plan: %s\n", alum.plan);
|
||||
printf("Clave: %s\n", alum.clave);
|
||||
printf("Nombre: %s\n", alum.nombre);
|
||||
printf("Correo: %s\n", alum.correo);
|
||||
printf("Estatus: %c\n", alum.estatus);
|
||||
printf("Telefono: %s\n", alum.telefono);
|
||||
printf("Semestre: %d\n", alum.semestre);
|
||||
printf("Sexo: %c\n", alum.sexo);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user