using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using BitMiracle.Docotic.Pdf;
using Console = HyperCube.Utils.AdvConsole;

namespace HyperCube
{
    /// <summary>
    /// Extracts plain text and basic metadata (name, authors, keywords) from uploaded documents.
    /// </summary>
    public class DocParse
    {
        // The patterns are fixed, so the Regex instances are built once instead of on every call.

        // First sentence of the text: everything from the very start of the string up to the
        // first '.', '!' or '?' that is followed by whitespace or the end of the string.
        // No Multiline option is set, so '^' only matches at index 0 (at most one match).
        private static readonly Regex NameRegex = new(@"^.*?[\.!\?](?:\s|$)");

        // Two capital Cyrillic initials, each followed by a dot, then whitespace and a
        // capitalised Cyrillic surname (one capital plus 1-20 lowercase letters).
        private static readonly Regex AuthorsRegex = new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}");

        // The "ключевые слова:" ("keywords:") label followed by a list that ends with a dot.
        private static readonly Regex KeywordsRegex =
            new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase);

        // Any run of whitespace (spaces, tabs, line breaks).
        private static readonly Regex WhitespaceRegex = new(@"\s+");

        /// <summary>
        /// Reads a PDF from <paramref name="ms"/> and fills an article model with the
        /// extracted text, name, authors and keywords.
        /// </summary>
        /// <param name="ms">Stream holding the PDF; it is rewound to position 0 before reading.</param>
        /// <returns>
        /// A new article model. <c>Text</c> is always set; <c>Name</c>, <c>Authors</c> and
        /// <c>Keywords</c> are set only when the corresponding pattern is found in the text.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="ms"/> is <c>null</c>.</exception>
        public static Models.ArticleModel ReadPDF(MemoryStream ms)
        {
            if (ms is null)
            {
                throw new ArgumentNullException(nameof(ms));
            }

            Models.ArticleModel articleModel = new();

            ms.Position = 0;

            // PdfDocument is disposable; release it as soon as the text has been extracted.
            // NOTE(review): assumes disposing the document leaves the caller's stream usable - confirm.
            string text;
            using (PdfDocument pdf = new(ms))
            {
                text = pdf.GetText();
            }

            // Article name: the first sentence of the document.
            Match nameMatch = NameRegex.Match(text);
            if (nameMatch.Success)
            {
                articleModel.Name = nameMatch.Value.Trim();
            }
            else
            {
                Console.WriteLine("cant get name");
            }

            // Publish date: not extracted yet.

            // Authors: every "I. O. Surname" occurrence, joined with ", ".
            MatchCollection authorMatches = AuthorsRegex.Matches(text);
            if (authorMatches.Count > 0)
            {
                articleModel.Authors = string.Join(", ", authorMatches.Cast<Match>().Select(m => m.Value));
            }
            else
            {
                Console.WriteLine("authors not found");
            }

            // Keywords: the list that follows the "ключевые слова:" label.
            MatchCollection keywordMatches = KeywordsRegex.Matches(text);
            if (keywordMatches.Count > 0)
            {
                string keywords = string.Join(", ", keywordMatches.Cast<Match>().Select(m => m.Value));

                // Collapse line breaks and repeated spaces into single spaces.
                keywords = WhitespaceRegex.Replace(keywords, " ");

                // Remove every hyphen (presumably to undo line-break hyphenation - confirm).
                keywords = keywords.Replace("-", "");

                // Skip the label: the pattern guarantees a colon followed by whitespace, which
                // is a single space after the collapse above, so +2 lands on the first keyword.
                keywords = keywords.Substring(keywords.IndexOf(':') + 2);

                articleModel.Keywords = keywords;
            }
            else
            {
                Console.WriteLine("keywords not found");
            }

            // Annotation: not extracted yet.

            articleModel.Text = text;
            return articleModel;
        }

        /// <summary>
        /// Placeholder for DOCX parsing; no document is read yet.
        /// </summary>
        /// <returns>Always an empty string.</returns>
        public static string ReadDocx()
        {
            StringBuilder pageText = new();
            return pageText.ToString();
        }
    }
}