123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107 |
- using BitMiracle.Docotic.Pdf;
- using Microsoft.AspNetCore.Components.Forms;
- using System;
- using System.Collections.Generic;
- using System.IO;
- using System.Linq;
- using System.Text;
- using System.Text.RegularExpressions;
- using System.Threading.Tasks;
- namespace HyperCube
- {
/// <summary>
/// Extracts plain text and a few bibliographic fields (authors, keywords)
/// from uploaded documents.
/// </summary>
public class DocParse
{
    /// <summary>
    /// Upper bound, in bytes, passed to <c>OpenReadStream</c> when reading an upload.
    /// </summary>
    private const long MaxFileSize = 5120000;

    // The patterns are built once and reused; Regex instances are immutable and
    // safe to share between calls.

    /// <summary>Matches "I. O. Surname"-style names written in Cyrillic.</summary>
    private static readonly Regex AuthorRegex =
        new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}", RegexOptions.Compiled);

    /// <summary>
    /// Matches the "ключевые слова:" label followed by the keyword list up to
    /// the first full stop.
    /// </summary>
    private static readonly Regex KeywordsRegex =
        new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Matches a hyphen that ends a line, together with the line break after it.</summary>
    private static readonly Regex LineBreakHyphenRegex =
        new(@"-\s*[\r\n]\s*", RegexOptions.Compiled);

    /// <summary>Matches any run of whitespace.</summary>
    private static readonly Regex WhitespaceRegex =
        new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Reads an uploaded PDF, extracts its text and fills in the fields that can
    /// be recognised from that text.
    /// </summary>
    /// <param name="file">The uploaded file; at most <see cref="MaxFileSize"/> bytes are accepted.</param>
    /// <returns>
    /// A dictionary with the keys "name", "date", "authors", "keywords",
    /// "annotation" and "text". Only "authors", "keywords" and "text" are
    /// populated here; the remaining values are empty strings.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
    public static async Task<Dictionary<string, string>> ReadPDF(IBrowserFile file)
    {
        if (file is null)
        {
            throw new ArgumentNullException(nameof(file));
        }

        Console.WriteLine("ReadPDF start");

        Dictionary<string, string> docFields = new()
        {
            { "name", "" },
            { "date", "" },
            { "authors", "" },
            { "keywords", "" },
            { "annotation", "" },
            { "text", "" }
        };

        // The upload is copied into memory before parsing.
        // NOTE(review): presumably because the browser stream is async-only and
        // not seekable, while the PDF reader needs a seekable stream - confirm.
        using MemoryStream buffer = new();
        await using (Stream upload = file.OpenReadStream(MaxFileSize))
        {
            await upload.CopyToAsync(buffer);
        }

        buffer.Position = 0;
        Console.WriteLine($"stream length: {buffer.Length}, canread: {buffer.CanRead}, canseek:{buffer.CanSeek}");

        // Declared after the buffer so it is disposed first (reverse order).
        using PdfDocument pdf = new(buffer);
        string text = pdf.GetText();

        docFields["authors"] = ExtractAuthors(text);
        docFields["keywords"] = ExtractKeywords(text);
        docFields["text"] = text;

        Console.WriteLine("ReadPDF end");
        return docFields;
    }

    /// <summary>
    /// Placeholder for DOCX support: no document is read yet and the result is
    /// always an empty string.
    /// </summary>
    /// <returns>An empty string.</returns>
    public string ReadDocx()
    {
        StringBuilder pageText = new();
        return pageText.ToString();
    }

    /// <summary>
    /// Placeholder: <paramref name="txtdata"/> is currently ignored and an empty
    /// dictionary is returned.
    /// </summary>
    /// <param name="txtdata">Text to parse (unused for now).</param>
    /// <returns>An empty dictionary.</returns>
    private Dictionary<string, string> ParseTextData(string txtdata)
    {
        Dictionary<string, string> docAttributes = new();
        return docAttributes;
    }

    /// <summary>
    /// Collects every author-like name found in <paramref name="text"/>.
    /// </summary>
    /// <returns>The names joined with ", ", or an empty string when none match.</returns>
    private static string ExtractAuthors(string text)
    {
        MatchCollection found = AuthorRegex.Matches(text);
        if (found.Count == 0)
        {
            Console.WriteLine("authors not found");
            return "";
        }

        return string.Join(", ", found.Cast<Match>().Select(m => m.Value));
    }

    /// <summary>
    /// Collects the keyword lists found in <paramref name="text"/>.
    /// </summary>
    /// <returns>
    /// The cleaned keyword lists joined with ", ", or an empty string when the
    /// label is not present.
    /// </returns>
    private static string ExtractKeywords(string text)
    {
        MatchCollection found = KeywordsRegex.Matches(text);
        if (found.Count == 0)
        {
            Console.WriteLine("keywords not found");
            return "";
        }

        // Each match is cleaned on its own so the label is stripped from every
        // occurrence, not only from the first one in the joined string.
        return string.Join(", ", found.Cast<Match>().Select(m => CleanKeywordList(m.Value)));
    }

    /// <summary>
    /// Normalises one keyword match: re-joins words hyphenated across a line
    /// break, collapses whitespace and drops the leading label.
    /// </summary>
    private static string CleanKeywordList(string matched)
    {
        // Line-break hyphens are removed before whitespace is collapsed;
        // otherwise the two halves of a word stay separated by a space.
        // Hyphens inside a word are kept.
        string cleaned = LineBreakHyphenRegex.Replace(matched, "");
        cleaned = WhitespaceRegex.Replace(cleaned, " ");

        // The pattern guarantees a ':' at the end of the label.
        return cleaned.Substring(cleaned.IndexOf(':') + 1).Trim();
    }
}
- }
|