2012-06-28 66 views
5

我在使用OpenXML(C#)解析*.docx文档时遇到了一个问题:如何从DOCX中提取表格。

所以,我的步骤如下:
1. 加载*.docx文件
2. 获取段落列表
3. 在每个段落中查找文本、图像和表格元素
4. 为每个文本和图像元素创建html标签
5. 将输出保存为*.html文件

我已经找到了如何在文档中定位图像文件并将其提取出来。现在还剩一个步骤——找到表格在文本(段落)中的位置。

如果有人知道如何使用OpenXML查找*.docx文件中的表格,请帮忙。谢谢。

补充:好吧,可能是我没有把意思解释清楚。如果我们获取段落的内容,就可以找到文本块、图片等子(child)对象。因此,如果段落中的某个运行(Run)包含图片,就意味着Word文档在这个位置放置了图像。

我的示例函数:

/// <summary>
/// Converts the paragraphs of a .docx file into an HTML fragment, one div element per paragraph.
/// </summary>
/// <remarks>
/// Paragraphs are collected with <c>Descendants&lt;Paragraph&gt;()</c> on the whole document, so this
/// method never emits table markup: table elements themselves are not visited, only the paragraphs
/// found anywhere below the document root.
/// </remarks>
/// <param name="pathToFile">Path of the .docx file to read.</param>
/// <returns>The generated HTML; each paragraph is followed by <see cref="Environment.NewLine"/>.</returns>
public static string ParseDocxDocument(string pathToFile)
{
    StringBuilder result = new StringBuilder();

    // The document is only read, so open it read-only; the using block releases the
    // package even when parsing throws half-way through.
    using (WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, false))
    {
        MainDocumentPart mainPart = wordProcessingDoc.MainDocumentPart;
        List<ImagePart> imgPart = mainPart.ImageParts.ToList();
        int imgCounter = 0;

        foreach (Paragraph par in mainPart.Document.Descendants<Paragraph>())
        {
            ParagraphProperties properties = par.ParagraphProperties;

            // Open the paragraph tag.
            result.Append("<div style=\"width:100%; text-align:");

            // Alignment; a justification element without a value is treated like a missing one.
            if (properties != null && properties.Justification != null && properties.Justification.Val != null)
            {
                switch (properties.Justification.Val.Value)
                {
                    case JustificationValues.Left:
                        result.Append("left;");
                        break;
                    case JustificationValues.Center:
                        result.Append("center;");
                        break;
                    case JustificationValues.Both:
                        result.Append("justify;");
                        break;
                    case JustificationValues.Right:
                    default:
                        result.Append("right;");
                        break;
                }
            }
            else
            {
                result.Append("left;");
            }

            // Text decoration taken from the paragraph mark run properties.
            if (properties != null && properties.ParagraphMarkRunProperties != null)
            {
                foreach (OpenXmlElement child in properties.ParagraphMarkRunProperties.ChildElements)
                {
                    if (child is Bold)
                    {
                        result.Append("font-weight:bold;");
                    }
                    else if (child is Underline)
                    {
                        result.Append("text-decoration:underline;");
                    }
                    else if (child is Italic)
                    {
                        result.Append("font-style:italic;");
                    }
                    else if (child is FontSize)
                    {
                        // w:sz stores the size in half-points, so halve it and emit points.
                        FontSize fontSize = (FontSize)child;
                        double halfPoints;
                        if (fontSize.Val != null
                            && double.TryParse(
                                fontSize.Val.Value,
                                System.Globalization.NumberStyles.Float,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out halfPoints))
                        {
                            result.Append("font-size:"
                                + (halfPoints / 2).ToString(System.Globalization.CultureInfo.InvariantCulture)
                                + "pt;");
                        }
                    }
                }
            }

            result.Append("\">");

            // Images: one img tag per w:pict child of every run in the paragraph.
            foreach (Run run in par.Descendants<Run>())
            {
                foreach (DocumentFormat.OpenXml.Wordprocessing.Picture picture
                    in run.Elements<DocumentFormat.OpenXml.Wordprocessing.Picture>())
                {
                    DocumentFormat.OpenXml.Vml.Shape shape =
                        picture.Elements<DocumentFormat.OpenXml.Vml.Shape>().FirstOrDefault();
                    DocumentFormat.OpenXml.Vml.ImageData imageData = shape == null
                        ? null
                        : shape.Elements<DocumentFormat.OpenXml.Vml.ImageData>().FirstOrDefault();

                    // Prefer the part referenced by the picture's relationship id: the order of
                    // ImageParts is not tied to the order in which pictures appear in the text.
                    ImagePart image = null;
                    if (imageData != null && imageData.RelationshipId != null)
                    {
                        string relationshipId = imageData.RelationshipId.Value;
                        image = imgPart.FirstOrDefault(p => mainPart.GetIdOfPart(p) == relationshipId);
                    }

                    // Fall back to the positional lookup, but never index past the end of the list.
                    if (image == null && imgCounter < imgPart.Count)
                    {
                        image = imgPart[imgCounter];
                    }

                    imgCounter++;

                    if (image == null)
                    {
                        continue;
                    }

                    string style = shape != null && shape.Style != null ? shape.Style.Value : string.Empty;
                    using (Stream imageStream = image.GetStream(FileMode.Open, FileAccess.Read))
                    {
                        result.Append(string.Format(
                            "<img style=\"{1}\" src=\"data:{2};base64,{0}\" />",
                            GetBase64Image(imageStream),
                            System.Net.WebUtility.HtmlEncode(style),
                            image.ContentType));
                    }
                }
            }

            // Inner text; an empty paragraph is rendered as a line break.
            List<Text> textElements = par.Descendants<Text>().ToList();
            if (textElements.Count == 0)
            {
                result.Append("<br />");
            }

            foreach (Text t in textElements)
            {
                // Document text is not markup: escape it so '<' or '&' cannot break the HTML.
                result.Append(System.Net.WebUtility.HtmlEncode(t.Text));
            }

            result.Append("</div>");
            result.Append(Environment.NewLine);
        }
    }

    return result.ToString();
}

现在我想(want)确定表格在文本中的位置(即它在Word中出现的位置)。

最终结论:

好了,各位,我已经找到了答案。我的示例函数中有一个很大的错误:我枚举的是文档Body中的段落元素,而表格与段落处于同一层级,所以函数忽略了表格。因此我们需要枚举文档Body的所有子元素。

下面是我的测试函数,它能从docx生成正确的HTML(这只是测试代码,所以不够整洁):

/// <summary>
/// Converts a .docx file into an HTML fragment, walking the direct children of the document body
/// so that paragraphs and tables are emitted in the order in which they appear.
/// </summary>
/// <remarks>
/// Paragraphs become div elements (with alignment, paragraph-mark text decoration, inline images
/// and text); tables become table/tr/td markup containing the plain inner text of each cell.
/// Other kinds of body children are skipped.
/// </remarks>
/// <param name="pathToFile">Path of the .docx file to read.</param>
/// <returns>The generated HTML.</returns>
public static string ParseDocxDocument(string pathToFile)
{
    StringBuilder result = new StringBuilder();

    // The document is only read, so open it read-only; the using block releases the
    // package even when parsing throws half-way through.
    using (WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, false))
    {
        MainDocumentPart mainPart = wordProcessingDoc.MainDocumentPart;
        Body body = mainPart.Document.Body;
        if (body == null)
        {
            return string.Empty;
        }

        List<ImagePart> imgPart = mainPart.ImageParts.ToList();
        int imgCounter = 0;

        foreach (OpenXmlElement section in body.Elements<OpenXmlElement>())
        {
            Paragraph par = section as Paragraph;
            if (par != null)
            {
                AppendParagraphHtml(result, par, mainPart, imgPart, ref imgCounter);
                continue;
            }

            Table tab = section as Table;
            if (tab != null)
            {
                AppendTableHtml(result, tab);
            }
        }
    }

    return result.ToString();
}

/// <summary>Appends one paragraph as a styled div element, including its images and escaped text.</summary>
/// <param name="result">Builder receiving the HTML.</param>
/// <param name="par">Paragraph to render.</param>
/// <param name="mainPart">Main document part, used to resolve picture relationship ids.</param>
/// <param name="imgPart">Image parts of the main document part.</param>
/// <param name="imgCounter">Running count of pictures seen so far; used as a positional fallback.</param>
private static void AppendParagraphHtml(
    StringBuilder result,
    Paragraph par,
    MainDocumentPart mainPart,
    List<ImagePart> imgPart,
    ref int imgCounter)
{
    ParagraphProperties properties = par.ParagraphProperties;

    // Open the paragraph tag.
    result.Append("<div style=\"width:100%; text-align:");

    // Alignment; a justification element without a value is treated like a missing one.
    if (properties != null && properties.Justification != null && properties.Justification.Val != null)
    {
        switch (properties.Justification.Val.Value)
        {
            case JustificationValues.Left:
                result.Append("left;");
                break;
            case JustificationValues.Center:
                result.Append("center;");
                break;
            case JustificationValues.Both:
                result.Append("justify;");
                break;
            case JustificationValues.Right:
            default:
                result.Append("right;");
                break;
        }
    }
    else
    {
        result.Append("left;");
    }

    // Text decoration taken from the paragraph mark run properties.
    if (properties != null && properties.ParagraphMarkRunProperties != null)
    {
        foreach (OpenXmlElement child in properties.ParagraphMarkRunProperties.ChildElements)
        {
            if (child is Bold)
            {
                result.Append("font-weight:bold;");
            }
            else if (child is Underline)
            {
                result.Append("text-decoration:underline;");
            }
            else if (child is Italic)
            {
                result.Append("font-style:italic;");
            }
            else if (child is FontSize)
            {
                // w:sz stores the size in half-points, so halve it and emit points.
                FontSize fontSize = (FontSize)child;
                double halfPoints;
                if (fontSize.Val != null
                    && double.TryParse(
                        fontSize.Val.Value,
                        System.Globalization.NumberStyles.Float,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out halfPoints))
                {
                    result.Append("font-size:"
                        + (halfPoints / 2).ToString(System.Globalization.CultureInfo.InvariantCulture)
                        + "pt;");
                }
            }
        }
    }

    result.Append("\">");

    // Images: one img tag per w:pict child of every run in the paragraph.
    foreach (Run run in par.Descendants<Run>())
    {
        foreach (DocumentFormat.OpenXml.Wordprocessing.Picture picture
            in run.Elements<DocumentFormat.OpenXml.Wordprocessing.Picture>())
        {
            DocumentFormat.OpenXml.Vml.Shape shape =
                picture.Elements<DocumentFormat.OpenXml.Vml.Shape>().FirstOrDefault();
            DocumentFormat.OpenXml.Vml.ImageData imageData = shape == null
                ? null
                : shape.Elements<DocumentFormat.OpenXml.Vml.ImageData>().FirstOrDefault();

            // Prefer the part referenced by the picture's relationship id: pictures inside
            // table cells are not rendered here, so a purely positional index would drift.
            ImagePart image = null;
            if (imageData != null && imageData.RelationshipId != null)
            {
                string relationshipId = imageData.RelationshipId.Value;
                image = imgPart.FirstOrDefault(p => mainPart.GetIdOfPart(p) == relationshipId);
            }

            // Fall back to the positional lookup, but never index past the end of the list.
            if (image == null && imgCounter < imgPart.Count)
            {
                image = imgPart[imgCounter];
            }

            imgCounter++;

            if (image == null)
            {
                continue;
            }

            string style = shape != null && shape.Style != null ? shape.Style.Value : string.Empty;
            using (Stream imageStream = image.GetStream(FileMode.Open, FileAccess.Read))
            {
                result.Append(string.Format(
                    "<img style=\"{1}\" src=\"data:{2};base64,{0}\" />",
                    GetBase64Image(imageStream),
                    System.Net.WebUtility.HtmlEncode(style),
                    image.ContentType));
            }
        }
    }

    // Inner text; an empty paragraph is rendered as a line break. No filtering against table
    // cell contents is done: only body-level paragraphs reach this method, and such a filter
    // would drop any run whose text happens to equal a cell's text (e.g. spaces vs. an empty cell).
    List<Text> textElements = par.Descendants<Text>().ToList();
    if (textElements.Count == 0)
    {
        result.Append("<br />");
    }

    foreach (Text t in textElements)
    {
        // Document text is not markup: escape it so '<' or '&' cannot break the HTML.
        result.Append(System.Net.WebUtility.HtmlEncode(t.Text));
    }

    result.Append("</div>");
    result.Append(Environment.NewLine);
}

/// <summary>Appends a table as table/tr/td markup holding the escaped inner text of each cell.</summary>
/// <param name="result">Builder receiving the HTML.</param>
/// <param name="tab">Table to render.</param>
private static void AppendTableHtml(StringBuilder result, Table tab)
{
    result.Append("<table>");

    // Only rows whose nearest enclosing table is this one: rows of a nested table must not
    // be repeated as rows of the outer table (their text is already part of the outer cell).
    foreach (TableRow row in tab.Descendants<TableRow>()
        .Where(r => ReferenceEquals(r.Ancestors<Table>().FirstOrDefault(), tab)))
    {
        TableRow currentRow = row;
        result.Append("<tr>");

        foreach (TableCell cell in currentRow.Descendants<TableCell>()
            .Where(c => ReferenceEquals(c.Ancestors<TableRow>().FirstOrDefault(), currentRow)))
        {
            result.Append("<td>");
            result.Append(System.Net.WebUtility.HtmlEncode(cell.InnerText));
            result.Append("</td>");
        }

        result.Append("</tr>");
    }

    result.Append("</table>");
}

    /// <summary>
    /// Reads a stream from its current position to its end and returns the bytes as Base64 text.
    /// </summary>
    /// <param name="inputData">Stream to read. It is not closed by this method.</param>
    /// <returns>Base64 encoding of the bytes read; an empty string when nothing is left to read.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="inputData"/> is null.</exception>
    private static string GetBase64Image(Stream inputData)
    {
        if (inputData == null)
        {
            throw new ArgumentNullException("inputData");
        }

        // A single Stream.Read call may return fewer bytes than requested, which would leave
        // the tail of the buffer zeroed. CopyTo loops until the end of the stream and does not
        // require the stream to expose Length.
        using (MemoryStream buffer = new MemoryStream())
        {
            inputData.CopyTo(buffer);
            return Convert.ToBase64String(buffer.ToArray());
        }
    }

回答

1

尝试用以下代码查找文档中的第一个表格。

// Take the first table that is a direct child of the document body.
// FirstOrDefault yields null when the body contains no table, whereas First would
// throw InvalidOperationException; check for null before using the result.
Table table = doc.MainDocumentPart.Document.Body.Elements<Table>().FirstOrDefault();
+0

我知道如何读取和解析表格。我的问题是如何找到它在正文中的位置 – EkzoMan

+0

我已经添加了我的工作代码。你的文章给了我正确的工作方向,所以我将你的答案标记为正确 – EkzoMan